site/node_modules/parse-entities/lib/index.js
2024-10-14 08:09:33 +02:00

407 lines
11 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @typedef {import('unist').Point} Point
* @typedef {import('unist').Position} Position
*/
import {characterEntitiesLegacy} from 'character-entities-legacy'
import {characterReferenceInvalid} from 'character-reference-invalid'
import {isDecimal} from 'is-decimal'
import {isHexadecimal} from 'is-hexadecimal'
import {isAlphanumerical} from 'is-alphanumerical'
import {decodeNamedCharacterReference} from 'decode-named-character-reference'
// Local alias for `String.fromCharCode`.
const {fromCharCode} = String

// Warning messages, indexed by warning code.
// Index 0 is an unused placeholder so that the codes start at 1.
const messages = [
  '', // 0: (unused)
  'Named character references must be terminated by a semicolon', // 1: Non terminated (named)
  'Numeric character references must be terminated by a semicolon', // 2: Non terminated (numeric)
  'Named character references cannot be empty', // 3: Empty (named)
  'Numeric character references cannot be empty', // 4: Empty (numeric)
  'Named character references must be known', // 5: Unknown (named)
  'Numeric character references cannot be disallowed', // 6: Disallowed (numeric)
  'Numeric character references cannot be outside the permissible Unicode range' // 7: Prohibited (numeric)
]
/**
* Parse HTML character references.
*
* @param {string} value
* @param {import('../index.js').Options} [options={}]
*/
export function parseEntities(value, options = {}) {
const additional =
typeof options.additional === 'string'
? options.additional.charCodeAt(0)
: options.additional
/** @type {Array<string>} */
const result = []
let index = 0
let lines = -1
let queue = ''
/** @type {Point|undefined} */
let point
/** @type {Array<number>|undefined} */
let indent
if (options.position) {
if ('start' in options.position || 'indent' in options.position) {
// @ts-expect-error: points dont have indent.
indent = options.position.indent
// @ts-expect-error: points dont have indent.
point = options.position.start
} else {
point = options.position
}
}
let line = (point ? point.line : 0) || 1
let column = (point ? point.column : 0) || 1
// Cache the current point.
let previous = now()
/** @type {number|undefined} */
let character
// Ensure the algorithm walks over the first character (inclusive).
index--
while (++index <= value.length) {
// If the previous character was a newline.
if (character === 10 /* `\n` */) {
column = (indent ? indent[lines] : 0) || 1
}
character = value.charCodeAt(index)
if (character === 38 /* `&` */) {
const following = value.charCodeAt(index + 1)
// The behavior depends on the identity of the next character.
if (
following === 9 /* `\t` */ ||
following === 10 /* `\n` */ ||
following === 12 /* `\f` */ ||
following === 32 /* ` ` */ ||
following === 38 /* `&` */ ||
following === 60 /* `<` */ ||
Number.isNaN(following) ||
(additional && following === additional)
) {
// Not a character reference.
// No characters are consumed, and nothing is returned.
// This is not an error, either.
queue += fromCharCode(character)
column++
continue
}
const start = index + 1
let begin = start
let end = start
/** @type {string} */
let type
if (following === 35 /* `#` */) {
// Numerical reference.
end = ++begin
// The behavior further depends on the next character.
const following = value.charCodeAt(end)
if (following === 88 /* `X` */ || following === 120 /* `x` */) {
// ASCII hexadecimal digits.
type = 'hexadecimal'
end = ++begin
} else {
// ASCII decimal digits.
type = 'decimal'
}
} else {
// Named reference.
type = 'named'
}
let characterReferenceCharacters = ''
let characterReference = ''
let characters = ''
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
const test =
type === 'named'
? isAlphanumerical
: type === 'decimal'
? isDecimal
: isHexadecimal
end--
while (++end <= value.length) {
const following = value.charCodeAt(end)
if (!test(following)) {
break
}
characters += fromCharCode(following)
// Check if we can match a legacy named reference.
// If so, we cache that as the last viable named reference.
// This ensures we do not need to walk backwards later.
if (type === 'named' && characterEntitiesLegacy.includes(characters)) {
characterReferenceCharacters = characters
// @ts-expect-error: always able to decode.
characterReference = decodeNamedCharacterReference(characters)
}
}
let terminated = value.charCodeAt(end) === 59 /* `;` */
if (terminated) {
end++
const namedReference =
type === 'named' ? decodeNamedCharacterReference(characters) : false
if (namedReference) {
characterReferenceCharacters = characters
characterReference = namedReference
}
}
let diff = 1 + end - start
let reference = ''
if (!terminated && options.nonTerminated === false) {
// Empty.
} else if (!characters) {
// An empty (possible) reference is valid, unless its numeric (thus an
// ampersand followed by an octothorp).
if (type !== 'named') {
warning(4 /* Empty (numeric) */, diff)
}
} else if (type === 'named') {
// An ampersand followed by anything unknown, and not terminated, is
// invalid.
if (terminated && !characterReference) {
warning(5 /* Unknown (named) */, 1)
} else {
// If theres something after an named reference which is not known,
// cap the reference.
if (characterReferenceCharacters !== characters) {
end = begin + characterReferenceCharacters.length
diff = 1 + end - begin
terminated = false
}
// If the reference is not terminated, warn.
if (!terminated) {
const reason = characterReferenceCharacters
? 1 /* Non terminated (named) */
: 3 /* Empty (named) */
if (options.attribute) {
const following = value.charCodeAt(end)
if (following === 61 /* `=` */) {
warning(reason, diff)
characterReference = ''
} else if (isAlphanumerical(following)) {
characterReference = ''
} else {
warning(reason, diff)
}
} else {
warning(reason, diff)
}
}
}
reference = characterReference
} else {
if (!terminated) {
// All nonterminated numeric references are not rendered, and emit a
// warning.
warning(2 /* Non terminated (numeric) */, diff)
}
// When terminated and numerical, parse as either hexadecimal or
// decimal.
let referenceCode = Number.parseInt(
characters,
type === 'hexadecimal' ? 16 : 10
)
// Emit a warning when the parsed number is prohibited, and replace with
// replacement character.
if (prohibited(referenceCode)) {
warning(7 /* Prohibited (numeric) */, diff)
reference = fromCharCode(65533 /* `<60>` */)
} else if (referenceCode in characterReferenceInvalid) {
// Emit a warning when the parsed number is disallowed, and replace by
// an alternative.
warning(6 /* Disallowed (numeric) */, diff)
reference = characterReferenceInvalid[referenceCode]
} else {
// Parse the number.
let output = ''
// Emit a warning when the parsed number should not be used.
if (disallowed(referenceCode)) {
warning(6 /* Disallowed (numeric) */, diff)
}
// Serialize the number.
if (referenceCode > 0xffff) {
referenceCode -= 0x10000
output += fromCharCode((referenceCode >>> (10 & 0x3ff)) | 0xd800)
referenceCode = 0xdc00 | (referenceCode & 0x3ff)
}
reference = output + fromCharCode(referenceCode)
}
}
// Found it!
// First eat the queued characters as normal text, then eat a reference.
if (reference) {
flush()
previous = now()
index = end - 1
column += end - start + 1
result.push(reference)
const next = now()
next.offset++
if (options.reference) {
options.reference.call(
options.referenceContext,
reference,
{start: previous, end: next},
value.slice(start - 1, end)
)
}
previous = next
} else {
// If we could not find a reference, queue the checked characters (as
// normal characters), and move the pointer to their end.
// This is possible because we can be certain neither newlines nor
// ampersands are included.
characters = value.slice(start - 1, end)
queue += characters
column += characters.length
index = end - 1
}
} else {
// Handle anything other than an ampersand, including newlines and EOF.
if (character === 10 /* `\n` */) {
line++
lines++
column = 0
}
if (Number.isNaN(character)) {
flush()
} else {
queue += fromCharCode(character)
column++
}
}
}
// Return the reduced nodes.
return result.join('')
// Get current position.
function now() {
return {
line,
column,
offset: index + ((point ? point.offset : 0) || 0)
}
}
/**
* Handle the warning.
*
* @param {1|2|3|4|5|6|7} code
* @param {number} offset
*/
function warning(code, offset) {
/** @type {ReturnType<now>} */
let position
if (options.warning) {
position = now()
position.column += offset
position.offset += offset
options.warning.call(
options.warningContext,
messages[code],
position,
code
)
}
}
/**
* Flush `queue` (normal text).
* Macro invoked before each reference and at the end of `value`.
* Does nothing when `queue` is empty.
*/
function flush() {
if (queue) {
result.push(queue)
if (options.text) {
options.text.call(options.textContext, queue, {
start: previous,
end: now()
})
}
queue = ''
}
}
}
/**
 * Tell whether a numeric reference points outside the permissible Unicode
 * range: either into the surrogate block (0xD800 through 0xDFFF) or past the
 * last code point (0x10FFFF).
 *
 * @param {number} code
 *   Parsed value of the numeric reference.
 * @returns {boolean}
 *   `true` when `code` must not be turned into a character.
 */
function prohibited(code) {
  const isSurrogate = code >= 0xd800 && code <= 0xdfff
  const isBeyondUnicode = code > 0x10ffff

  return isSurrogate || isBeyondUnicode
}
/**
 * Tell whether a numeric reference points at a code that should not be used.
 *
 * Matches the control codes 0x01 through 0x08, 0x0B, 0x0D through 0x1F, and
 * 0x7F through 0x9F, the range 0xFDD0 through 0xFDEF, and every value whose
 * low 16 bits are 0xFFFE or 0xFFFF.
 *
 * @param {number} code
 *   Parsed value of the numeric reference.
 * @returns {boolean}
 *   `true` when `code` is disallowed.
 */
function disallowed(code) {
  const lowBits = code & 0xffff

  if (lowBits === 0xfffe || lowBits === 0xffff) {
    return true
  }

  if (code >= 0xfdd0 && code <= 0xfdef) {
    return true
  }

  if (code >= 0x007f && code <= 0x009f) {
    return true
  }

  if (code >= 0x000d && code <= 0x001f) {
    return true
  }

  if (code === 0x000b) {
    return true
  }

  return code >= 0x0001 && code <= 0x0008
}