diff --git a/internal/test-utils/content/code-comments.md b/internal/test-utils/content/code-comments.md new file mode 100644 index 00000000..166edd57 --- /dev/null +++ b/internal/test-utils/content/code-comments.md @@ -0,0 +1,191 @@ +# Code comments test document + +This document contains code blocks in many popular languages. Each code block includes comments in the language's syntax, and some also include contents that might be misinterpreted as a comment by a naive parser. + +The contents of actual comments are wrapped between pairs of `!!!` sequences (e.g. `!!! this is a comment !!!`), which allows tests to verify that the comments are recognized correctly from start to end. + +The contents of expressions that might be misinterpreted as comments contain a `XXX` sequence that can be used by tests to verify that the parser does not treat them as comments. + +## JavaScript + +```js +// !!! separate line !!! +console.log('Hello, world!'); // !!! end of line 1 !!! +/* !!! multi-line comment on its own single line !!! */ +console.log('Hello, world!'); /* !!! multi-line comment starting +at the end of a line and spanning multiple lines +after it !!! */ let a = 1; // !!! end of line 2 !!! +/* !!! another multi-line comment +// this is not a separate comment +end of multi-line comment !!! */ +let b = `Template strings can span +// multiple lines and contents inside can XXX +look like comments /* although they are not XXX */, +and they can contain ${/* !!! real comments inside +expressions !!! */ +// !!! even multiple ones !!! +'nested /* strings XXX */'}`; +``` + +## TypeScript + +```ts +function test(x: number): void { + // !!! separate line !!! + console.log('Hello, world!'); // !!! end of line 1 !!! + /* !!! multi-line comment on its own single line !!! */ + console.log('Hello, world!'); /* !!! multi-line comment starting + at the end of a line and spanning multiple lines + after it !!! */ let a = 1 as number; // !!! end of line 2 !!! + /* !!! 
another multi-line comment + // this is not a separate comment + end of multi-line comment !!! */ + let b: string = `Template strings can span + // multiple lines and contents inside can XXX + look like comments /* although they are not XXX */, + and they can contain ${/* !!! real comments inside + expressions !!! */ + // !!! even multiple ones !!! + 'nested /* strings XXX */' + x}`; +} +``` + +## HTML + +```html + +

Hello, world! XXX +XXX +

+

Hello, world!

Some text + +Some text + + +``` + +## CSS + +```css +/* !!! separate line !!! */ +body { + color: black; /* !!! end of line 1 !!! */ + font-size: 2rem; /* !!! multi-line comment starting + at the end of a line and spanning multiple lines + after it !!! */ background-color: white; /* !!! end of line 2 !!! */ +} +/* !!! another multi-line comment +/* this is not a separate comment, CSS does not support nesting comments +end of multi-line comment !!! */ +div { + /* !!! beginning of line !!! */color: black; + content: "/* this is not a comment XXX */"; + content: 'and /* this neither XXX */'; +} +``` + +## MD + +````md + +# Hello, world! +Some text Some text +// this is not a comment XXX +```js +// !!! comment in nested JS code !!! +return ` + +` +``` + +```` + +## MDX + +````mdx +{/* !!! separate line !!! */} +# Hello, world! {/* !!! end of line 1 !!! */} +Some text {/* !!! multi-line comment starting +at the end of a line and spanning multiple lines +after it !!! */} Some text {/* !!! end of line 2 !!! */} +// this is not a comment XXX +```js +// !!! comment in nested JS code !!! +return ` +{/* this is not a comment XXX */} +` +``` +{/* !!! back in mdx !!! */} +```` + +## YAML + +```yml +# !!! separate line !!! +key: value # !!! end of value !!! +list: # !!! end of list start !!! + - item1 # !!! end of list item !!! +string: "This is a string # XXX and this is not a comment" # !!! end of string !!! +literal: | + This is a literal block scalar that can contain + various "quotes" like 'this', + and even blank lines: # XXX not a comment + + # Newlines are kept as-is. XXX +# !!! end of block scalar 1 !!! +folded: > + This is a folded block scalar that can contain + various "quotes" like 'this', + and even blank lines: # XXX not a comment + + # also not a comment XXX + - even + - things: that look like lists, + some extra indentation, + it's all just a long multiline string. +# !!! end of block scalar 2 !!! 
+a: |2 + # XXX with indentation indicator +b: |-2 + # XXX with indentation indicator and no end newline +c: |+2 + # XXX with indentation indicator and all end newlines +d: >2 + # XXX with indentation indicator +e: >-2 + # XXX with indentation indicator and no end newline +f: >+2 + # XXX with indentation indicator and all end newlines +# !!! end of all block scalars !!! +``` + +# Todos + +- JSX/TSX +- Python +- Ruby +- Shell +- SQL +- JSON +- TOML +- XML +- Java +- C# +- C++ +- C +- Swift +- Kotlin +- Rust diff --git a/packages/@expressive-code/core/src/helpers/comment-parser.ts b/packages/@expressive-code/core/src/helpers/comment-parser.ts new file mode 100644 index 00000000..c9ccfae2 --- /dev/null +++ b/packages/@expressive-code/core/src/helpers/comment-parser.ts @@ -0,0 +1,350 @@
+/** A pair of numeric positions delimiting a span. */
+type Range = {
+  start: number
+  end: number
+}
+
+/**
+ * A comment found in the parsed code.
+ *
+ * NOTE(review): presumably the "outer" members include the comment delimiters
+ * and `contentRange`/`text` cover only the comment body - the code that fills
+ * these fields is not part of this section, so confirm against the state parsers.
+ */
+interface Comment {
+  outerRange: Range
+  contentRange: Range
+  outerText: string
+  text: string
+  type: 'single' | 'multi'
+}
+
+/**
+ * One entry of the parser's state stack.
+ *
+ * - A regular state only provides a `parser` callback, which the main loop
+ *   calls when this state is the current one.
+ * - A capturing state (`capturing: true`) is called for every index while it
+ *   is anywhere on the stack. When its `parser` returns `true`, the main loop
+ *   skips the current state's parser for that index.
+ */
+type ParserState =
+  | {
+      parser: (ctx: ParsingContext) => void
+    }
+  | {
+      capturing: true
+      parser: (ctx: ParsingContext) => boolean
+    }
+
+/** Mutable state shared by all state parsers during a single parse run. */
+interface ParsingContext {
+  /** The full code being parsed. */
+  code: string
+  /** Index into `code`, advanced by the main loop from 0 to `code.length` inclusive. */
+  i: number
+  /** Initialized to -1 at the start of a run. */
+  commentStart: number
+  /** Initialized to -1 at the start of a run. */
+  commentContentStart: number
+  escaped: boolean
+  /** Result list that is returned once the run has finished. */
+  comments: Comment[]
+  /** Stack of entered states; the last element is the current state. */
+  stateStack: Array<ParserState>
+  /** Pushes `state` onto the stack, making it the current state. */
+  enterState: (state: ParserState) => void
+  /** Pops the current state; does nothing if the stack is empty. */
+  exitState: () => void
+  /** Returns the top of the stack, or the initial state if the stack is empty. */
+  currentState: () => ParserState
+  /** Returns all stack entries that have `capturing: true`, in stack order. */
+  allCapturingStates: () => ParserState[]
+}
+
+/**
+ * Parses `code` and returns the comments found in it.
+ *
+ * @param code - The source text to scan.
+ * @param language - Language identifier used to pick the initial parser state
+ *   (see `getInitialStateByLanguage`).
+ * @returns The list of comments collected during parsing.
+ */
+export function parseComments(code: string, language: string) {
+  const parse = createParser(getInitialStateByLanguage(language))
+  return parse(code)
+}
+
+/**
+ * Creates a parse function that starts in `initialState`.
+ *
+ * Every call of the returned function builds a fresh parsing context, so
+ * no state is shared between runs.
+ *
+ * @param initialState - State used whenever the state stack is empty.
+ * @returns A function that parses the given code and returns the collected comments.
+ */
+function createParser(initialState: ParserState) {
+  return function (code: string) {
+    const ctx: ParsingContext = {
+      code: code,
+      i: 0,
+      commentStart: -1,
+      commentContentStart: -1,
+      escaped: false,
+      comments: [],
+      stateStack: [],
+      enterState: function (state) {
+        this.stateStack.push(state)
+      },
+      exitState: function () {
+        if (this.stateStack.length > 0) {
+          this.stateStack.pop()
+        }
+      },
+      currentState: function () {
+        return this.stateStack[this.stateStack.length - 1] || initialState
+      },
+      allCapturingStates: function () {
+        return this.stateStack.filter((state) => 'capturing' in state && state.capturing === true)
+      },
+    }
+
+    // Loop through the code, calling the current state function
+    // (note that we also call it once past the end of the code
+    // to ensure that any final state can be handled)
+    for (; ctx.i <= ctx.code.length; ctx.i++) {
+      // Capturing states get the first chance at every index; `some` stops at the
+      // first one that returns true, and the current state is then skipped.
+      // NOTE(review): if the current state is itself capturing and returns false,
+      // its parser runs a second time for the same index in the call below.
+      if (ctx.allCapturingStates().some((state) => state.parser(ctx))) continue
+      ctx.currentState().parser(ctx)
+    }
+    return ctx.comments
+  }
+}
+
+/**
+ * Selects the initial parser state for a language identifier.
+ *
+ * The identifier is compared by exact match. Any identifier not listed here
+ * falls back to the JS base parser.
+ *
+ * @param language - Language identifier, e.g. `html`, `md` or `mdx`.
+ * @returns The initial (non-capturing) parser state for that language.
+ */
+function getInitialStateByLanguage(language: string): ParserState {
+  switch (language) {
+    case 'htm':
+    case 'html':
+    case 'xsl':
+      return { parser: parseBaseHtmlCode }
+    case 'md':
+    case 'markdown':
+      return { parser: parseBaseMdCode }
+    case 'mdx':
+      return { parser: parseBaseMdxCode }
+    default:
+      return { parser: parseBaseJsCode }
+  }
+}
+
+// TODO: HTML can include