@@ -56,6 +56,7 @@ import {Parser, Token, TokenizerMode, html} from 'parse5'
 import {pointStart, pointEnd} from 'unist-util-position'
 import {visit} from 'unist-util-visit'
 import {zwitch} from 'zwitch'
+import {webNamespaces} from 'web-namespaces'
 
 // Node types associated with MDX.
 // <https://github.com/mdx-js/mdx/blob/641eb91/packages/mdx/lib/node-types.js>
@@ -123,9 +124,9 @@ export function raw(tree, options) {
 
   // Unpack if possible and when not given a `root`.
   if (
-    tree.type !== 'root' &&
     result.type === 'root' &&
-    result.children.length === 1
+    result.children.length === 1 &&
+    result.children[0].type === tree.type
   ) {
     return result.children[0]
   }
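
With the tweaked check, unwrapping only happens when the reparsed result is a `root` whose single child has the same type as the input; everything else stays wrapped. A rough usage sketch of that behavior (not part of the commit; assumes hastscript’s `h` for brevity):

import {h} from 'hastscript'
import {raw} from 'hast-util-raw'

// A lone element round-trips to an element…
const one = raw(h('p', 'hi'))
console.log(one.type) // ⇒ 'element'

// …while a `root` (or anything expanding to several nodes) stays a `root`.
const many = raw({type: 'root', children: [h('p', 'hi'), h('p', 'there')]})
console.log(many.type) // ⇒ 'root'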
@@ -179,25 +180,11 @@ function root(node, state) {
  *   Nothing.
  */
 function element(node, state) {
-  resetTokenizer(state, pointStart(node))
-  // @ts-expect-error: private.
-  // type-coverage:ignore-next-line
-  state.parser.currentToken = startTag(node)
-  // @ts-expect-error: private.
-  // type-coverage:ignore-next-line
-  state.parser._processToken(state.parser.currentToken)
+  startTag(node, state)
 
   all(node.children, state)
 
-  if (!htmlVoidElements.includes(node.tagName)) {
-    resetTokenizer(state, pointEnd(node))
-    // @ts-expect-error: private.
-    // type-coverage:ignore-next-line
-    state.parser.currentToken = endTag(node)
-    // @ts-expect-error: private.
-    // type-coverage:ignore-next-line
-    state.parser._processToken(state.parser.currentToken)
-  }
+  endTag(node, state)
 }
 
 /**
@@ -366,6 +353,7 @@ function handleRaw(node, state) {
   // consumes the whole token.
 
   // Note: `State` is not exposed by `parse5`, so these numbers are fragile.
+  // See: <https://github.com/inikulin/parse5/blob/46cba43/packages/parse5/lib/tokenizer/index.ts#L58>
   if (
     state.parser.tokenizer.state === 72 /* NAMED_CHARACTER_REFERENCE */ ||
    state.parser.tokenizer.state === 78 /* NUMERIC_CHARACTER_REFERENCE_END */
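
These `72`/`78` literals mirror parse5’s internal `State` enum, which is why the permalink above is pinned to a specific commit. If they ever need to move, one low-risk option is to name them once next to that link (a sketch only, not what this commit does):

// Values copied from parse5’s internal `State` enum (see the permalink
// above); they may drift between parse5 releases.
const NAMED_CHARACTER_REFERENCE = 72
const NUMERIC_CHARACTER_REFERENCE_END = 78

if (
  state.parser.tokenizer.state === NAMED_CHARACTER_REFERENCE ||
  state.parser.tokenizer.state === NUMERIC_CHARACTER_REFERENCE_END
) {
  // …same handling as in `handleRaw` above.
}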
@@ -460,14 +448,9 @@ function resetTokenizer(state, point) {
   // type-coverage:ignore-next-line
   state.parser.tokenizer.inLoop = false
 
-  // To do: make all of this much smarter: `inForeignNode`, `state`, `returnState`, etc.
-
-  // Note: don’t reset `inForeignNode` so that the state of HTML in SVG
-  // in HTML etc is kept.
-
-  state.parser.tokenizer.lastStartTagName = ''
+  // Note: don’t reset `state`, `inForeignNode`, or `lastStartTagName`; we
+  // manually update those when needed.
   state.parser.tokenizer.active = false
-  state.parser.tokenizer.state = TokenizerMode.DATA
   // @ts-expect-error: private.
   // type-coverage:ignore-next-line
   state.parser.tokenizer.returnState = TokenizerMode.DATA
@@ -530,21 +513,39 @@ function setPoint(state, point) {
 }
 
 /**
- * Create a `parse5` `endTag` token.
+ * Emit a start tag.
  *
  * @param {Element} node
  *   Element.
- * @returns {TagToken}
- *   Start tag token.
+ * @param {State} state
+ *   Info passed around about the current state.
+ * @returns {void}
+ *   Nothing.
  */
-function startTag(node) {
-  // To do: pass `space`.
-  const result = toParse5({...node, children: []})
+function startTag(node, state) {
+  // Ignore tags if we’re in plain text.
+  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
+
+  resetTokenizer(state, pointStart(node))
+
+  const current = state.parser.openElements.current
+  let ns = 'namespaceURI' in current ? current.namespaceURI : webNamespaces.html
+
+  if (ns === webNamespaces.html && node.tagName === 'svg') {
+    ns = webNamespaces.svg
+  }
+
+  const result = toParse5(
+    // Shallow clone to not delve into `children`: we only need the attributes.
+    {...node, children: []},
+    {space: ns === webNamespaces.svg ? 'svg' : 'html'}
+  )
   // Always element.
   /* c8 ignore next */
   const attrs = 'attrs' in result ? result.attrs : []
 
-  return {
+  /** @type {TagToken} */
+  const tag = {
     type: Token.TokenType.START_TAG,
     tagName: node.tagName,
     tagID: html.getTagID(node.tagName),
@@ -554,18 +555,54 @@ function startTag(node)
     attrs,
     location: createParse5Location(node)
   }
+
+  // The HTML parsing algorithm works by doing half of the state management in
+  // the tokenizer and half in the parser.
+  // We can’t use the tokenizer here, as we don’t have strings.
+  // So we act *as if* the tokenizer emits tokens:
+
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser.currentToken = tag
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser._processToken(state.parser.currentToken)
+
+  // …but then we still need a bunch of work that the tokenizer would normally
+  // do, such as:
+
+  // Set a tag name, similar to how the tokenizer would do it.
+  state.parser.tokenizer.lastStartTagName = node.tagName
+
+  // `inForeignNode` is correctly set by the parser.
 }
 
 /**
- * Create a `parse5` `endTag` token.
+ * Emit an end tag.
  *
  * @param {Element} node
  *   Element.
- * @returns {TagToken}
- *   End tag token.
+ * @param {State} state
+ *   Info passed around about the current state.
+ * @returns {void}
+ *   Nothing.
  */
-function endTag(node) {
-  return {
+function endTag(node, state) {
+  // Do not emit closing tags for HTML void elements.
+  if (
+    !state.parser.tokenizer.inForeignNode &&
+    htmlVoidElements.includes(node.tagName)
+  ) {
+    return
+  }
+
+  // Ignore tags if we’re in plain text.
+  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return
+
+  resetTokenizer(state, pointEnd(node))
+
+  /** @type {TagToken} */
+  const tag = {
     type: Token.TokenType.END_TAG,
     tagName: node.tagName,
     tagID: html.getTagID(node.tagName),
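
The start-tag half above also decides the namespace from the current open element, so raw markup inside `<svg>` is parsed as foreign content rather than HTML. A rough illustration of the effect (not part of the commit; assumes hastscript’s `h` and `s` helpers and a `raw` node, as used elsewhere in this project):

import {h, s} from 'hastscript'
import {raw} from 'hast-util-raw'

const tree = h('div', [
  s('svg', [{type: 'raw', value: '<circle r="1"/>'}])
])

// `<circle>` is tokenized in the SVG namespace, so foreign-content rules
// apply (for example, the self-closing slash is honored).
const reified = raw(tree)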
@@ -574,6 +611,36 @@ function endTag(node)
     attrs: [],
     location: createParse5Location(node)
   }
+
+  // The HTML parsing algorithm works by doing half of the state management in
+  // the tokenizer and half in the parser.
+  // We can’t use the tokenizer here, as we don’t have strings.
+  // So we act *as if* the tokenizer emits tokens:
+
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser.currentToken = tag
+  // @ts-expect-error: private.
+  // type-coverage:ignore-next-line
+  state.parser._processToken(state.parser.currentToken)
+
+  // …but then we still need a bunch of work that the tokenizer would normally
+  // do, such as:
+
+  // Switch back to the data state after alternative states that don’t accept
+  // tags:
+  if (
+    // Current element is closed.
+    tag.tagName === state.parser.tokenizer.lastStartTagName &&
+    // `<textarea>` and `<title>`
+    (state.parser.tokenizer.state === TokenizerMode.RCDATA ||
+      // `<iframe>`, `<noembed>`, `<style>`, `<xmp>`
+      state.parser.tokenizer.state === TokenizerMode.RAWTEXT ||
+      // `<script>`
+      state.parser.tokenizer.state === TokenizerMode.SCRIPT_DATA)
+  ) {
+    state.parser.tokenizer.state = TokenizerMode.DATA
+  }
 }
 
 /**
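
The end-tag half mirrors what the real tokenizer does after `</title>`, `</textarea>`, `</script>`, and friends: once the matching element closes, it falls back to the data state so whatever follows is tokenized as markup again. A rough illustration (not part of the commit; same assumptions as the sketches above):

import {h} from 'hastscript'
import {raw} from 'hast-util-raw'

const tree = h('div', [
  h('title', 'Hi'),
  {type: 'raw', value: '<strong>bold</strong>'}
])

// Because the tokenizer is switched back to `TokenizerMode.DATA` after
// `</title>`, the raw value is parsed as markup and yields a `<strong>`
// element; staying in RCDATA would have turned it into plain text.
const reified = raw(tree)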