@@ -56,6 +56,7 @@ import {Parser, Token, TokenizerMode, html} from 'parse5'
 import {pointStart, pointEnd} from 'unist-util-position'
 import {visit} from 'unist-util-visit'
 import {zwitch} from 'zwitch'
+import {webNamespaces} from 'web-namespaces'
 
 // Node types associated with MDX.
 // <https://github.com/mdx-js/mdx/blob/641eb91/packages/mdx/lib/node-types.js>
@@ -123,9 +124,9 @@ export function raw(tree, options) {
 
   // Unpack if possible and when not given a `root`.
   if (
-    tree.type !== 'root' &&
     result.type === 'root' &&
-    result.children.length === 1
+    result.children.length === 1 &&
+    result.children[0].type === tree.type
   ) {
     return result.children[0]
   }
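
With the tweaked check, unwrapping only happens when the reparsed result is a `root` whose single child has the same type as the input; everything else stays wrapped. A rough usage sketch of that behavior (not part of the commit; assumes hastscript’s `h` for brevity):

import {h} from 'hastscript'
import {raw} from 'hast-util-raw'

// A lone element round-trips to an element…
const one = raw(h('p', 'hi'))
console.log(one.type) // ⇒ 'element'

// …while a `root` (or anything expanding to several nodes) stays a `root`.
const many = raw({type: 'root', children: [h('p', 'hi'), h('p', 'there')]})
console.log(many.type) // ⇒ 'root'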
@@ -179,25 +180,11 @@ function root(node, state) {
  *   Nothing.
  */
 function element(node, state) {
-  resetTokenizer(state, pointStart(node))
-  // @ts-expect-error: private.
-  // type-coverage:ignore-next-line
-  state.parser.currentToken = startTag(node)
-  // @ts-expect-error: private.
-  // type-coverage:ignore-next-line
-  state.parser._processToken(state.parser.currentToken)
+  startTag(node, state)
 
   all(node.children, state)
 
-  if (!htmlVoidElements.includes(node.tagName)) {
-    resetTokenizer(state, pointEnd(node))
-    // @ts-expect-error: private.
-    // type-coverage:ignore-next-line
-    state.parser.currentToken = endTag(node)
-    // @ts-expect-error: private.
-    // type-coverage:ignore-next-line
-    state.parser._processToken(state.parser.currentToken)
-  }
+  endTag(node, state)
 }
 
 /**
@@ -366,6 +353,7 @@ function handleRaw(node, state) {
   // consumes the whole token.
 
   // Note: `State` is not exposed by `parse5`, so these numbers are fragile.
+  // See: <https://github.com/inikulin/parse5/blob/46cba43/packages/parse5/lib/tokenizer/index.ts#L58>
   if (
     state.parser.tokenizer.state === 72 /* NAMED_CHARACTER_REFERENCE */ ||
    state.parser.tokenizer.state === 78 /* NUMERIC_CHARACTER_REFERENCE_END */
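
These `72`/`78` literals mirror parse5’s internal `State` enum, which is why the permalink above is pinned to a specific commit. If they ever need to move, one low-risk option is to name them once next to that link (a sketch only, not what this commit does):

// Values copied from parse5’s internal `State` enum (see the permalink
// above); they may drift between parse5 releases.
const NAMED_CHARACTER_REFERENCE = 72
const NUMERIC_CHARACTER_REFERENCE_END = 78

if (
  state.parser.tokenizer.state === NAMED_CHARACTER_REFERENCE ||
  state.parser.tokenizer.state === NUMERIC_CHARACTER_REFERENCE_END
) {
  // …same handling as in `handleRaw` above.
}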
@@ -460,14 +448,9 @@ function resetTokenizer(state, point) {
   // type-coverage:ignore-next-line
   state.parser.tokenizer.inLoop = false
 
-  // To do: make all of this much smarter: `inForeignNode`, `state`, `returnState`, etc.
-
-  // Note: don’t reset `inForeignNode` so that the state of HTML in SVG
-  // in HTML etc is kept.
-
-  state.parser.tokenizer.lastStartTagName = ''
+  // Note: don’t reset `state`, `inForeignNode`, or `lastStartTagName`; we
+  // manually update those when needed.
   state.parser.tokenizer.active = false
-  state.parser.tokenizer.state = TokenizerMode.DATA
   // @ts-expect-error: private.
   // type-coverage:ignore-next-line
   state.parser.tokenizer.returnState = TokenizerMode.DATA
@@ -530,21 +513,39 @@ function setPoint(state, point) {
 }
 
 /**
- * Create a `parse5` `endTag` token.
+ * Emit a start tag.
  *
  * @param {Element} node
  *   Element.
- * @returns {TagToken}
- *   Start tag token.
+ * @param {State} state
+ *   Info passed around about the current state.
+ * @returns {void}
+ *   Nothing.
  */
-function startTag(node) {
-  // To do: pass `space`.
-  const result = toParse5({...node, children: []})
+function startTag(node, state) {
+  // Ignore tags if we’re in plain text.
+  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
+
+  resetTokenizer(state, pointStart(node))
+
+  const current = state.parser.openElements.current
+  let ns = 'namespaceURI' in current ? current.namespaceURI : webNamespaces.html
+
+  if (ns === webNamespaces.html && node.tagName === 'svg') {
+    ns = webNamespaces.svg
+  }
+
+  const result = toParse5(
+    // Shallow clone to not delve into `children`: we only need the attributes.
+    {...node, children: []},
+    {space: ns === webNamespaces.svg ? 'svg' : 'html'}
+  )
   // Always element.
   /* c8 ignore next */
   const attrs = 'attrs' in result ? result.attrs : []
 
-  return {
+  /** @type {TagToken} */
+  const tag = {
     type: Token.TokenType.START_TAG,
     tagName: node.tagName,
     tagID: html.getTagID(node.tagName),
@@ -554,18 +555,54 @@ function startTag(node)
     attrs,
     location: createParse5Location(node)
   }
+
+  // The HTML parsing algorithm works by doing half of the state management in
+  // the tokenizer and half in the parser.
+  // We can’t use the tokenizer here, as we don’t have strings.
+  // So we act *as if* the tokenizer emits tokens:
+
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser.currentToken = tag
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser._processToken(state.parser.currentToken)
+
+  // …but then we still need a bunch of work that the tokenizer would normally
+  // do, such as:
+
+  // Set a tag name, similar to how the tokenizer would do it.
+  state.parser.tokenizer.lastStartTagName = node.tagName
+
+  // `inForeignNode` is correctly set by the parser.
 }
 
 /**
- * Create a `parse5` `endTag` token.
+ * Emit an end tag.
  *
  * @param {Element} node
  *   Element.
- * @returns {TagToken}
- *   End tag token.
+ * @param {State} state
+ *   Info passed around about the current state.
+ * @returns {void}
+ *   Nothing.
  */
-function endTag(node) {
-  return {
+function endTag(node, state) {
+  // Do not emit closing tags for HTML void elements.
+  if (
+    !state.parser.tokenizer.inForeignNode &&
+    htmlVoidElements.includes(node.tagName)
+  ) {
+    return
+  }
+
+  // Ignore tags if we’re in plain text.
+  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
+
+  resetTokenizer(state, pointEnd(node))
+
+  /** @type {TagToken} */
+  const tag = {
     type: Token.TokenType.END_TAG,
     tagName: node.tagName,
     tagID: html.getTagID(node.tagName),
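
The start-tag half above also decides the namespace from the current open element, so raw markup inside `<svg>` is parsed as foreign content rather than HTML. A rough illustration of the effect (not part of the commit; assumes hastscript’s `h` and `s` helpers and a `raw` node, as used elsewhere in this project):

import {h, s} from 'hastscript'
import {raw} from 'hast-util-raw'

const tree = h('div', [
  s('svg', [{type: 'raw', value: '<circle r="1"/>'}])
])

// `<circle>` is tokenized in the SVG namespace, so foreign-content rules
// apply (for example, the self-closing slash is honored).
const reified = raw(tree)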
@@ -574,6 +611,36 @@ function endTag(node)
     attrs: [],
     location: createParse5Location(node)
   }
+
+  // The HTML parsing algorithm works by doing half of the state management in
+  // the tokenizer and half in the parser.
+  // We can’t use the tokenizer here, as we don’t have strings.
+  // So we act *as if* the tokenizer emits tokens:
+
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser.currentToken = tag
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser._processToken(state.parser.currentToken)
+
+  // …but then we still need a bunch of work that the tokenizer would normally
+  // do, such as:
+
+  // Switch back to the data state after alternative states that don’t accept
+  // tags:
+  if (
+    // Current element is closed.
+    tag.tagName === state.parser.tokenizer.lastStartTagName &&
+    // `<textarea>` and `<title>`
+    (state.parser.tokenizer.state === TokenizerMode.RCDATA ||
+      // `<iframe>`, `<noembed>`, `<style>`, `<xmp>`
+      state.parser.tokenizer.state === TokenizerMode.RAWTEXT ||
+      // `<script>`
+      state.parser.tokenizer.state === TokenizerMode.SCRIPT_DATA)
+  ) {
+    state.parser.tokenizer.state = TokenizerMode.DATA
+  }
 }
 
 /**
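
The end-tag half mirrors what the real tokenizer does after `</title>`, `</textarea>`, `</script>`, and friends: once the matching element closes, it falls back to the data state so whatever follows is tokenized as markup again. A rough illustration (not part of the commit; same assumptions as the sketches above):

import {h} from 'hastscript'
import {raw} from 'hast-util-raw'

const tree = h('div', [
  h('title', 'Hi'),
  {type: 'raw', value: '<strong>bold</strong>'}
])

// Because the tokenizer is switched back to `TokenizerMode.DATA` after
// `</title>`, the raw value is parsed as markup and yields a `<strong>`
// element; staying in RCDATA would have turned it into plain text.
const reified = raw(tree)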