/**
 * @typedef {import('unist').Point} Point
 * @typedef {import('unist').Position} Position
 */

import {characterEntitiesLegacy} from 'character-entities-legacy'
import {characterReferenceInvalid} from 'character-reference-invalid'
import {isDecimal} from 'is-decimal'
import {isHexadecimal} from 'is-hexadecimal'
import {isAlphanumerical} from 'is-alphanumerical'
import {decodeNamedCharacterReference} from 'decode-named-character-reference'

const fromCharCode = String.fromCharCode

// Legacy named references, as a set: membership is tested once per character
// of every named reference, so avoid a linear array scan each time.
const legacyNamedReferences = new Set(characterEntitiesLegacy)

// Warning messages.
// The index of a message is the warning code passed to `options.warning`.
const messages = [
  '',
  /* 1: Non terminated (named) */
  'Named character references must be terminated by a semicolon',
  /* 2: Non terminated (numeric) */
  'Numeric character references must be terminated by a semicolon',
  /* 3: Empty (named) */
  'Named character references cannot be empty',
  /* 4: Empty (numeric) */
  'Numeric character references cannot be empty',
  /* 5: Unknown (named) */
  'Named character references must be known',
  /* 6: Disallowed (numeric) */
  'Numeric character references cannot be disallowed',
  /* 7: Prohibited (numeric) */
  'Numeric character references cannot be outside the permissible Unicode range'
]

/**
 * Parse HTML character references.
 *
 * Walks over `value` once, replacing named (`&amp;`), decimal (`&#38;`), and
 * hexadecimal (`&#x26;`) character references with the characters they
 * represent, and returns the resulting string.
 *
 * Fields of `options` read by this function:
 *
 * - `additional`: extra character (string, of which the first code unit is
 *   used, or a character code) that, directly after `&`, makes the ampersand
 *   plain text
 * - `attribute`: treat `value` as an attribute value; a nonterminated named
 *   reference followed by `=` or an alphanumerical is then left as text
 * - `nonTerminated`: when exactly `false`, references without a closing
 *   semicolon are left as text
 * - `position`: starting point, or an object with `start` and/or `indent`
 *   (`indent[n]` is the column at which line `n` after the first starts)
 * - `warning` / `warningContext`: called as `warning(message, point, code)`
 * - `reference` / `referenceContext`: called as
 *   `reference(decoded, {start, end}, source)` for each decoded reference
 * - `text` / `textContext`: called as `text(value, {start, end})` for each
 *   run of plain text
 *
 * @param {string} value
 *   Value to decode.
 * @param {import('../index.js').Options} [options={}]
 *   Configuration.
 * @returns {string}
 *   `value` with character references decoded.
 */
export function parseEntities(value, options = {}) {
  const additional =
    typeof options.additional === 'string'
      ? options.additional.charCodeAt(0)
      : options.additional
  /** @type {Array<string>} */
  const result = []
  let index = 0
  let lines = -1
  let queue = ''
  /** @type {Point|undefined} */
  let point
  /** @type {Array<number>|undefined} */
  let indent

  if (options.position) {
    if ('start' in options.position || 'indent' in options.position) {
      // @ts-expect-error: points don’t have indent.
      indent = options.position.indent
      // @ts-expect-error: points don’t have indent.
      point = options.position.start
    } else {
      point = options.position
    }
  }

  let line = (point ? point.line : 0) || 1
  let column = (point ? point.column : 0) || 1

  // Cache the current point.
  let previous = now()
  /** @type {number|undefined} */
  let character

  // Ensure the algorithm walks over the first character (inclusive).
  index--

  // `<=` (not `<`): the loop runs once more at `value.length`, where
  // `charCodeAt` yields `NaN`, which is how EOF is detected below.
  while (++index <= value.length) {
    // If the previous character was a newline.
    if (character === 10 /* `\n` */) {
      column = (indent ? indent[lines] : 0) || 1
    }

    character = value.charCodeAt(index)

    if (character === 38 /* `&` */) {
      const following = value.charCodeAt(index + 1)

      // The behavior depends on the identity of the next character.
      if (
        following === 9 /* `\t` */ ||
        following === 10 /* `\n` */ ||
        following === 12 /* `\f` */ ||
        following === 32 /* ` ` */ ||
        following === 38 /* `&` */ ||
        following === 60 /* `<` */ ||
        Number.isNaN(following) ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++
        continue
      }

      // `start` is the index directly after the ampersand; `begin` is where
      // the name or digits start (after `#` / `#x`); `end` is the scan cursor.
      const start = index + 1
      let begin = start
      let end = start
      /** @type {string} */
      let type

      if (following === 35 /* `#` */) {
        // Numerical reference.
        end = ++begin

        // The behavior further depends on the next character.
        const following = value.charCodeAt(end)

        if (following === 88 /* `X` */ || following === 120 /* `x` */) {
          // ASCII hexadecimal digits.
          type = 'hexadecimal'
          end = ++begin
        } else {
          // ASCII decimal digits.
          type = 'decimal'
        }
      } else {
        // Named reference.
        type = 'named'
      }

      let characterReferenceCharacters = ''
      let characterReference = ''
      let characters = ''
      // Each type of character reference accepts different characters.
      // This test is used to detect whether a reference has ended (as the
      // semicolon is not strictly needed).
      const test =
        type === 'named'
          ? isAlphanumerical
          : type === 'decimal'
          ? isDecimal
          : isHexadecimal

      end--

      while (++end <= value.length) {
        const following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === 'named' && legacyNamedReferences.has(characters)) {
          characterReferenceCharacters = characters
          // @ts-expect-error: always able to decode.
          characterReference = decodeNamedCharacterReference(characters)
        }
      }

      let terminated = value.charCodeAt(end) === 59 /* `;` */

      if (terminated) {
        end++

        const namedReference =
          type === 'named' ? decodeNamedCharacterReference(characters) : false

        if (namedReference) {
          characterReferenceCharacters = characters
          characterReference = namedReference
        }
      }

      let diff = 1 + end - start
      let reference = ''

      if (!terminated && options.nonTerminated === false) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) reference is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== 'named') {
          warning(4 /* Empty (numeric) */, diff)
        }
      } else if (type === 'named') {
        // A terminated reference whose name could not be decoded is invalid.
        if (terminated && !characterReference) {
          warning(5 /* Unknown (named) */, 1)
        } else {
          // If there’s something after a named reference which is not known,
          // cap the reference.
          if (characterReferenceCharacters !== characters) {
            end = begin + characterReferenceCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            const reason = characterReferenceCharacters
              ? 1 /* Non terminated (named) */
              : 3 /* Empty (named) */

            if (options.attribute) {
              const following = value.charCodeAt(end)

              if (following === 61 /* `=` */) {
                // Warn, and leave the characters as text.
                warning(reason, diff)
                characterReference = ''
              } else if (isAlphanumerical(following)) {
                // Silently leave the characters as text.
                characterReference = ''
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = characterReference
      } else {
        if (!terminated) {
          // Nonterminated numeric references emit a warning, but are still
          // decoded below.
          warning(2 /* Non terminated (numeric) */, diff)
        }

        // Parse the digits as either hexadecimal or decimal.
        const referenceCode = Number.parseInt(
          characters,
          type === 'hexadecimal' ? 16 : 10
        )

        // Emit a warning when the parsed number is prohibited, and replace with
        // replacement character.
        if (prohibited(referenceCode)) {
          warning(7 /* Prohibited (numeric) */, diff)
          reference = fromCharCode(65533 /* `�` */)
        } else if (referenceCode in characterReferenceInvalid) {
          // Emit a warning when the parsed number is disallowed, and replace by
          // an alternative.
          warning(6 /* Disallowed (numeric) */, diff)
          reference = characterReferenceInvalid[referenceCode]
        } else {
          // Emit a warning when the parsed number should not be used.
          if (disallowed(referenceCode)) {
            warning(6 /* Disallowed (numeric) */, diff)
          }

          // Serialize the number.
          // `prohibited` already rejected surrogates and anything above
          // U+10FFFF, so this is always a valid code point; astral code points
          // become a surrogate pair.
          reference = String.fromCodePoint(referenceCode)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat a reference.
      if (reference) {
        flush()

        previous = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        const next = now()
        // `index` points at the last consumed character; the end point is one
        // past it.
        next.offset++

        if (options.reference) {
          options.reference.call(
            options.referenceContext,
            reference,
            {start: previous, end: next},
            value.slice(start - 1, end)
          )
        }

        previous = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === 10 /* `\n` */) {
        line++
        lines++
        column = 0
      }

      if (Number.isNaN(character)) {
        // EOF: emit whatever text is still queued.
        flush()
      } else {
        queue += fromCharCode(character)
        column++
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  /**
   * Get the current position.
   *
   * @returns {Point}
   */
  function now() {
    return {
      line,
      column,
      offset: index + ((point ? point.offset : 0) || 0)
    }
  }

  /**
   * Handle the warning: call `options.warning`, if given, with the message
   * for `code`, the current point moved `offset` characters ahead, and `code`.
   *
   * @param {1|2|3|4|5|6|7} code
   * @param {number} offset
   */
  function warning(code, offset) {
    if (options.warning) {
      const position = now()

      position.column += offset
      position.offset += offset

      options.warning.call(
        options.warningContext,
        messages[code],
        position,
        code
      )
    }
  }

  /**
   * Flush `queue` (normal text).
   * Macro invoked before each reference and at the end of `value`.
   * Does nothing when `queue` is empty.
   */
  function flush() {
    if (queue) {
      result.push(queue)

      if (options.text) {
        options.text.call(options.textContext, queue, {
          start: previous,
          end: now()
        })
      }

      queue = ''
    }
  }
}

/**
 * Check if `code` is outside the permissible unicode range: a surrogate
 * (U+D800 to U+DFFF) or above U+10FFFF.
 *
 * @param {number} code
 * @returns {boolean}
 */
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}

/**
 * Check if `code` is disallowed: one of the listed control-character ranges,
 * U+FDD0 to U+FDEF, or a code whose low 16 bits are 0xFFFE or 0xFFFF.
 *
 * @param {number} code
 * @returns {boolean}
 */
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}