site/node_modules/hast-util-raw/lib/index.js

/**
 * @typedef {import('hast').Comment} Comment
 * @typedef {import('hast').Doctype} Doctype
 * @typedef {import('hast').Element} Element
 * @typedef {import('hast').Nodes} Nodes
 * @typedef {import('hast').Root} Root
 * @typedef {import('hast').RootContent} RootContent
 * @typedef {import('hast').Text} Text
 *
 * @typedef {import('mdast-util-to-hast').Raw} Raw
 *
 * @typedef {import('parse5').DefaultTreeAdapterMap} DefaultTreeAdapterMap
 * @typedef {import('parse5').ParserOptions<DefaultTreeAdapterMap>} ParserOptions
 * @typedef {import('parse5').Token.CharacterToken} CharacterToken
 * @typedef {import('parse5').Token.CommentToken} CommentToken
 * @typedef {import('parse5').Token.DoctypeToken} DoctypeToken
 * @typedef {import('parse5').Token.Location} Location
 * @typedef {import('parse5').Token.TagToken} TagToken
 *
 * @typedef {import('unist').Point} Point
 *
 * @typedef {import('vfile').VFile} VFile
 */

/**
 * @typedef Options
 *   Configuration.
 * @property {VFile | null | undefined} [file]
 *   Corresponding virtual file representing the input document (optional).
 * @property {Array<Nodes['type']> | null | undefined} [passThrough]
 *   List of custom hast node types to pass through (as in, keep) (optional).
 *
 *   If the passed through nodes have children, those children are expected to
 *   be hast again and will be handled.
 *
 * @typedef State
 *   Info passed around about the current state.
 * @property {(node: Nodes) => undefined} handle
 *   Add a hast node to the parser.
 * @property {Options} options
 *   User configuration.
 * @property {Parser<DefaultTreeAdapterMap>} parser
 *   Current parser.
 * @property {boolean} stitches
 *   Whether there are stitches.
 *
 * @typedef {{type: 'comment', value: {stitch: Nodes}}} Stitch
 *   Custom comment-like value we pass through parse5, which contains a
 *   replacement node that we’ll swap back in afterwards.
 */

import structuredClone from '@ungap/structured-clone'
import {fromParse5} from 'hast-util-from-parse5'
import {toParse5} from 'hast-util-to-parse5'
import {htmlVoidElements} from 'html-void-elements'
import {Parser, Token, TokenizerMode, html} from 'parse5'
import {pointEnd, pointStart} from 'unist-util-position'
import {visit} from 'unist-util-visit'
import {webNamespaces} from 'web-namespaces'
import {zwitch} from 'zwitch'

// Node types associated with MDX.
// <https://github.com/mdx-js/mdx/blob/8a56312/packages/mdx/lib/node-types.js>
const knownMdxNames = new Set([
  'mdxFlowExpression',
  'mdxJsxFlowElement',
  'mdxJsxTextElement',
  'mdxTextExpression',
  'mdxjsEsm'
])

/** @type {ParserOptions} */
const parseOptions = {sourceCodeLocationInfo: true, scriptingEnabled: false}

/**
 * Pass a hast tree through an HTML parser, which will fix nesting, and turn
 * raw nodes into actual nodes.
 *
 * @param {Nodes} tree
 *   Original hast tree to transform.
 * @param {Options | null | undefined} [options]
 *   Configuration (optional).
 * @returns {Nodes}
 *   Parsed again tree.
 */
export function raw(tree, options) {
  const document = documentMode(tree)
  /** @type {(node: Nodes, state: State) => undefined} */
  const one = zwitch('type', {
    handlers: {root, element, text, comment, doctype, raw: handleRaw},
    unknown
  })

  /** @type {State} */
  const state = {
    parser: document
      ? new Parser(parseOptions)
      : Parser.getFragmentParser(undefined, parseOptions),
    handle(node) {
      one(node, state)
    },
    stitches: false,
    options: options || {}
  }

  one(tree, state)
  resetTokenizer(state, pointStart())

  const p5 = document ? state.parser.document : state.parser.getFragment()
  const result = fromParse5(p5, {
    // To do: support `space`?
    file: state.options.file
  })

  if (state.stitches) {
    visit(result, 'comment', function (node, index, parent) {
      const stitch = /** @type {Stitch} */ (/** @type {unknown} */ (node))
      if (stitch.value.stitch && parent && index !== undefined) {
        /** @type {Array<RootContent>} */
        const siblings = parent.children
        // @ts-expect-error: assume the stitch is allowed.
        siblings[index] = stitch.value.stitch
        return index
      }
    })
  }

  // Unpack if possible and when not given a `root`.
  if (
    result.type === 'root' &&
    result.children.length === 1 &&
    result.children[0].type === tree.type
  ) {
    return result.children[0]
  }

  return result
}

/**
 * Transform all nodes
 *
 * @param {Array<RootContent>} nodes
 *   hast content.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function all(nodes, state) {
  let index = -1

  /* istanbul ignore else - invalid nodes, see rehypejs/rehype-raw#7. */
  if (nodes) {
    while (++index < nodes.length) {
      state.handle(nodes[index])
    }
  }
}

/**
 * Transform a root.
 *
 * @param {Root} node
 *   hast root node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function root(node, state) {
  all(node.children, state)
}

/**
 * Transform an element.
 *
 * @param {Element} node
 *   hast element node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function element(node, state) {
  startTag(node, state)

  all(node.children, state)

  endTag(node, state)
}

/**
 * Transform a text.
 *
 * @param {Text} node
 *   hast text node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function text(node, state) {
  /** @type {CharacterToken} */
  const token = {
    type: Token.TokenType.CHARACTER,
    chars: node.value,
    location: createParse5Location(node)
  }

  resetTokenizer(state, pointStart(node))
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.currentToken = token
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser._processToken(state.parser.currentToken)
}

/**
 * Transform a doctype.
 *
 * @param {Doctype} node
 *   hast doctype node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function doctype(node, state) {
  /** @type {DoctypeToken} */
  const token = {
    type: Token.TokenType.DOCTYPE,
    name: 'html',
    forceQuirks: false,
    publicId: '',
    systemId: '',
    location: createParse5Location(node)
  }

  resetTokenizer(state, pointStart(node))
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.currentToken = token
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser._processToken(state.parser.currentToken)
}

/**
 * Transform a stitch.
 *
 * @param {Nodes} node
 *   unknown node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function stitch(node, state) {
  // Mark that there are stitches, so we need to walk the tree and revert them.
  state.stitches = true

  /** @type {Nodes} */
  const clone = cloneWithoutChildren(node)

  // Recurse, because to somewhat handle `[<x>]</x>` (where `[]` denotes the
  // passed through node).
  if ('children' in node && 'children' in clone) {
    // Root in root out.
    const fakeRoot = /** @type {Root} */ (
      raw({type: 'root', children: node.children}, state.options)
    )
    clone.children = fakeRoot.children
  }

  // Hack: `value` is supposed to be a string, but as none of the tools
  // (`parse5` or `hast-util-from-parse5`) looks at it, we can pass nodes
  // through.
  comment({type: 'comment', value: {stitch: clone}}, state)
}

/**
 * Transform a comment (or stitch).
 *
 * @param {Comment | Stitch} node
 *   hast comment node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function comment(node, state) {
  /** @type {string} */
  // @ts-expect-error: we pass stitches through.
  const data = node.value

  /** @type {CommentToken} */
  const token = {
    type: Token.TokenType.COMMENT,
    data,
    location: createParse5Location(node)
  }
  resetTokenizer(state, pointStart(node))
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.currentToken = token
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser._processToken(state.parser.currentToken)
}

/**
 * Transform a raw node.
 *
 * @param {Raw} node
 *   hast raw node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function handleRaw(node, state) {
  // Reset preprocessor:
  // See: <https://github.com/inikulin/parse5/blob/6f7ca60/packages/parse5/lib/tokenizer/preprocessor.ts#L18-L31>.
  state.parser.tokenizer.preprocessor.html = ''
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.pos = -1
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.lastGapPos = -2
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.gapStack = []
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.skipNextNewLine = false
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.lastChunkWritten = false
  state.parser.tokenizer.preprocessor.endOfChunkHit = false
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.preprocessor.isEol = false

  // Now pass `node.value`.
  setPoint(state, pointStart(node))
  state.parser.tokenizer.write(node.value, false)
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer._runParsingLoop()

  // Character references hang, so if we ended there, we need to flush
  // those too.
  // We reset the preprocessor as if the document ends here.
  // Then one single call to the relevant state does the trick, parse5
  // consumes the whole token.

  // Note: `State` is not exposed by `parse5`, so these numbers are fragile.
  // See: <https://github.com/inikulin/parse5/blob/46cba43/packages/parse5/lib/tokenizer/index.ts#L58>
  // Note: a change to `parse5`, which breaks this, was merged but not released.
  // Investigate when it is.
  if (
    state.parser.tokenizer.state === 72 /* NAMED_CHARACTER_REFERENCE */ ||
    state.parser.tokenizer.state === 78 /* NUMERIC_CHARACTER_REFERENCE_END */
  ) {
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer.preprocessor.lastChunkWritten = true
    /** @type {number} */
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    const cp = state.parser.tokenizer._consume()
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer._callState(cp)
  }
}

/**
 * Crash on an unknown node.
 *
 * @param {unknown} node_
 *   unknown node.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Never.
 */
function unknown(node_, state) {
  const node = /** @type {Nodes} */ (node_)

  if (
    state.options.passThrough &&
    state.options.passThrough.includes(node.type)
  ) {
    stitch(node, state)
  } else {
    let extra = ''

    if (knownMdxNames.has(node.type)) {
      extra =
        ". It looks like you are using MDX nodes with `hast-util-raw` (or `rehype-raw`). If you use this because you are using remark or rehype plugins that inject `'html'` nodes, then please raise an issue with that plugin, as its a bad and slow idea. If you use this because you are using markdown syntax, then you have to configure this utility (or plugin) to pass through these nodes (see `passThrough` in docs), but you can also migrate to use the MDX syntax"
    }

    throw new Error('Cannot compile `' + node.type + '` node' + extra)
  }
}

/**
 * Reset the tokenizer of a parser.
 *
 * @param {State} state
 *   Info passed around about the current state.
 * @param {Point | undefined} point
 *   Point.
 * @returns {undefined}
 *   Nothing.
 */
function resetTokenizer(state, point) {
  setPoint(state, point)

  // Process final characters if they’re still there after hibernating.
  /** @type {CharacterToken} */
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  const token = state.parser.tokenizer.currentCharacterToken

  if (token && token.location) {
    token.location.endLine = state.parser.tokenizer.preprocessor.line
    token.location.endCol = state.parser.tokenizer.preprocessor.col + 1
    token.location.endOffset = state.parser.tokenizer.preprocessor.offset + 1
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.currentToken = token
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser._processToken(state.parser.currentToken)
  }

  // Reset tokenizer:
  // See: <https://github.com/inikulin/parse5/blob/6f7ca60/packages/parse5/lib/tokenizer/index.ts#L187-L223>.
  // Especially putting it back in the `data` state is useful: some elements,
  // like textareas and iframes, change the state.
  // See GH-7.
  // But also if broken HTML is in `raw`, and then a correct element is given.
  // See GH-11.
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.paused = false
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.inLoop = false

  // Note: don’t reset `state`, `inForeignNode`, or `lastStartTagName`, we
  // manually update those when needed.
  state.parser.tokenizer.active = false
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.returnState = TokenizerMode.DATA
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.charRefCode = -1
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.consumedAfterSnapshot = -1
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.currentLocation = null
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.currentCharacterToken = null
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.currentToken = null
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.tokenizer.currentAttr = {name: '', value: ''}
}

/**
 * Set current location.
 *
 * @param {State} state
 *   Info passed around about the current state.
 * @param {Point | undefined} point
 *   Point.
 * @returns {undefined}
 *   Nothing.
 */
function setPoint(state, point) {
  if (point && point.offset !== undefined) {
    /** @type {Location} */
    const location = {
      startLine: point.line,
      startCol: point.column,
      startOffset: point.offset,
      endLine: -1,
      endCol: -1,
      endOffset: -1
    }

    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer.preprocessor.lineStartPos = -point.column + 1 // Looks weird, but ensures we get correct positional info.
    state.parser.tokenizer.preprocessor.droppedBufferSize = point.offset
    state.parser.tokenizer.preprocessor.line = point.line
    // @ts-expect-error: private.
    // type-coverage:ignore-next-line
    state.parser.tokenizer.currentLocation = location
  }
}

/**
 * Emit a start tag.
 *
 * @param {Element} node
 *   Element.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function startTag(node, state) {
  // Ignore tags if we’re in plain text.
  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return

  resetTokenizer(state, pointStart(node))

  const current = state.parser.openElements.current
  let ns = 'namespaceURI' in current ? current.namespaceURI : webNamespaces.html

  if (ns === webNamespaces.html && node.tagName === 'svg') {
    ns = webNamespaces.svg
  }

  const result = toParse5(
    // Shallow clone to not delve into `children`: we only need the attributes.
    {...node, children: []},
    {space: ns === webNamespaces.svg ? 'svg' : 'html'}
  )
  // Always element.
  /* c8 ignore next */
  const attrs = 'attrs' in result ? result.attrs : []

  /** @type {TagToken} */
  const tag = {
    type: Token.TokenType.START_TAG,
    tagName: node.tagName,
    tagID: html.getTagID(node.tagName),
    // We always send start and end tags.
    selfClosing: false,
    ackSelfClosing: false,
    attrs,
    location: createParse5Location(node)
  }

  // The HTML parsing algorithm works by doing half of the state management in
  // the tokenizer and half in the parser.
  // We can’t use the tokenizer here, as we don’t have strings.
  // So we act *as if* the tokenizer emits tokens:

  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.currentToken = tag
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser._processToken(state.parser.currentToken)

  // …but then we still need a bunch of work that the tokenizer would normally
  // do, such as:

  // Set a tag name, similar to how the tokenizer would do it.
  state.parser.tokenizer.lastStartTagName = node.tagName

  // `inForeignNode` is correctly set by the parser.
}

/**
 * Emit an end tag.
 *
 * @param {Element} node
 *   Element.
 * @param {State} state
 *   Info passed around about the current state.
 * @returns {undefined}
 *   Nothing.
 */
function endTag(node, state) {
  // Do not emit closing tags for HTML void elements.
  if (
    !state.parser.tokenizer.inForeignNode &&
    htmlVoidElements.includes(node.tagName)
  ) {
    return
  }

  // Ignore tags if we’re in plain text.
  if (state.parser.tokenizer.state === TokenizerMode.PLAINTEXT) return

  resetTokenizer(state, pointEnd(node))

  /** @type {TagToken} */
  const tag = {
    type: Token.TokenType.END_TAG,
    tagName: node.tagName,
    tagID: html.getTagID(node.tagName),
    selfClosing: false,
    ackSelfClosing: false,
    attrs: [],
    location: createParse5Location(node)
  }

  // The HTML parsing algorithm works by doing half of the state management in
  // the tokenizer and half in the parser.
  // We can’t use the tokenizer here, as we don’t have strings.
  // So we act *as if* the tokenizer emits tokens:

  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser.currentToken = tag
  // @ts-expect-error: private.
  // type-coverage:ignore-next-line
  state.parser._processToken(state.parser.currentToken)

  // …but then we still need a bunch of work that the tokenizer would normally
  // do, such as:

  // Switch back to the data state after alternative states that don’t accept
  // tags:
  if (
    // Current element is closed.
    tag.tagName === state.parser.tokenizer.lastStartTagName &&
    // `<textarea>` and `<title>`
    (state.parser.tokenizer.state === TokenizerMode.RCDATA ||
      // `<iframe>`, `<noembed>`, `<style>`, `<xmp>`
      state.parser.tokenizer.state === TokenizerMode.RAWTEXT ||
      // `<script>`
      state.parser.tokenizer.state === TokenizerMode.SCRIPT_DATA)
  ) {
    state.parser.tokenizer.state = TokenizerMode.DATA
  }
}

/**
 * Check if `node` represents a whole document or a fragment.
 *
 * @param {Nodes} node
 *   hast node.
 * @returns {boolean}
 *   Whether this represents a whole document or a fragment.
 */
function documentMode(node) {
  const head = node.type === 'root' ? node.children[0] : node
  return Boolean(
    head &&
      (head.type === 'doctype' ||
        (head.type === 'element' && head.tagName === 'html'))
  )
}

/**
 * Get a `parse5` location from a node.
 *
 * @param {Nodes | Stitch} node
 *   hast node.
 * @returns {Location}
 *   `parse5` location.
 */
function createParse5Location(node) {
  const start = pointStart(node) || {
    line: undefined,
    column: undefined,
    offset: undefined
  }
  const end = pointEnd(node) || {
    line: undefined,
    column: undefined,
    offset: undefined
  }

  /** @type {Record<keyof Location, number | undefined>} */
  const location = {
    startLine: start.line,
    startCol: start.column,
    startOffset: start.offset,
    endLine: end.line,
    endCol: end.column,
    endOffset: end.offset
  }

  // @ts-expect-error: unist point values can be `undefined` in hast, which
  // `parse5` types don’t want.
  return location
}

/**
 * @template {Nodes} NodeType
 *   Node type.
 * @param {NodeType} node
 *   Node to clone.
 * @returns {NodeType}
 *   Cloned node, without children.
 */
function cloneWithoutChildren(node) {
  return 'children' in node
    ? structuredClone({...node, children: []})
    : structuredClone(node)
}