site/node_modules/parse-entities/lib/index.js

407 lines
11 KiB
JavaScript
Raw Normal View History

2024-10-14 08:09:33 +02:00
/**
* @typedef {import('unist').Point} Point
* @typedef {import('unist').Position} Position
*/
import {characterEntitiesLegacy} from 'character-entities-legacy'
import {characterReferenceInvalid} from 'character-reference-invalid'
import {isDecimal} from 'is-decimal'
import {isHexadecimal} from 'is-hexadecimal'
import {isAlphanumerical} from 'is-alphanumerical'
import {decodeNamedCharacterReference} from 'decode-named-character-reference'
// Local shorthand for `String.fromCharCode`.
const {fromCharCode} = String

// Human-readable warning texts, looked up by numeric warning code.
// Slot 0 is an empty placeholder so that codes 1 through 7 can be used as
// indices directly.
const messages = [
  '',
  // Code 1: a named reference that lacks its closing semicolon.
  'Named character references must be terminated by a semicolon',
  // Code 2: a numeric reference that lacks its closing semicolon.
  'Numeric character references must be terminated by a semicolon',
  // Code 3: a named reference without a name.
  'Named character references cannot be empty',
  // Code 4: a numeric reference without digits.
  'Numeric character references cannot be empty',
  // Code 5: a named reference whose name is not known.
  'Named character references must be known',
  // Code 6: a numeric reference to a disallowed code point.
  'Numeric character references cannot be disallowed',
  // Code 7: a numeric reference outside the permissible Unicode range.
  'Numeric character references cannot be outside the permissible Unicode range'
]
/**
* Parse HTML character references.
*
* @param {string} value
* @param {import('../index.js').Options} [options={}]
*/
export function parseEntities(value, options = {}) {
const additional =
typeof options.additional === 'string'
? options.additional.charCodeAt(0)
: options.additional
/** @type {Array<string>} */
const result = []
let index = 0
let lines = -1
let queue = ''
/** @type {Point|undefined} */
let point
/** @type {Array<number>|undefined} */
let indent
if (options.position) {
if ('start' in options.position || 'indent' in options.position) {
// @ts-expect-error: points dont have indent.
indent = options.position.indent
// @ts-expect-error: points dont have indent.
point = options.position.start
} else {
point = options.position
}
}
let line = (point ? point.line : 0) || 1
let column = (point ? point.column : 0) || 1
// Cache the current point.
let previous = now()
/** @type {number|undefined} */
let character
// Ensure the algorithm walks over the first character (inclusive).
index--
while (++index <= value.length) {
// If the previous character was a newline.
if (character === 10 /* `\n` */) {
column = (indent ? indent[lines] : 0) || 1
}
character = value.charCodeAt(index)
if (character === 38 /* `&` */) {
const following = value.charCodeAt(index + 1)
// The behavior depends on the identity of the next character.
if (
following === 9 /* `\t` */ ||
following === 10 /* `\n` */ ||
following === 12 /* `\f` */ ||
following === 32 /* ` ` */ ||
following === 38 /* `&` */ ||
following === 60 /* `<` */ ||
Number.isNaN(following) ||
(additional && following === additional)
) {
// Not a character reference.
// No characters are consumed, and nothing is returned.
// This is not an error, either.
queue += fromCharCode(character)
column++
continue
}
const start = index + 1
let begin = start
let end = start
/** @type {string} */
let type
if (following === 35 /* `#` */) {
// Numerical reference.
end = ++begin
// The behavior further depends on the next character.
const following = value.charCodeAt(end)
if (following === 88 /* `X` */ || following === 120 /* `x` */) {
// ASCII hexadecimal digits.
type = 'hexadecimal'
end = ++begin
} else {
// ASCII decimal digits.
type = 'decimal'
}
} else {
// Named reference.
type = 'named'
}
let characterReferenceCharacters = ''
let characterReference = ''
let characters = ''
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
const test =
type === 'named'
? isAlphanumerical
: type === 'decimal'
? isDecimal
: isHexadecimal
end--
while (++end <= value.length) {
const following = value.charCodeAt(end)
if (!test(following)) {
break
}
characters += fromCharCode(following)
// Check if we can match a legacy named reference.
// If so, we cache that as the last viable named reference.
// This ensures we do not need to walk backwards later.
if (type === 'named' && characterEntitiesLegacy.includes(characters)) {
characterReferenceCharacters = characters
// @ts-expect-error: always able to decode.
characterReference = decodeNamedCharacterReference(characters)
}
}
let terminated = value.charCodeAt(end) === 59 /* `;` */
if (terminated) {
end++
const namedReference =
type === 'named' ? decodeNamedCharacterReference(characters) : false
if (namedReference) {
characterReferenceCharacters = characters
characterReference = namedReference
}
}
let diff = 1 + end - start
let reference = ''
if (!terminated && options.nonTerminated === false) {
// Empty.
} else if (!characters) {
// An empty (possible) reference is valid, unless its numeric (thus an
// ampersand followed by an octothorp).
if (type !== 'named') {
warning(4 /* Empty (numeric) */, diff)
}
} else if (type === 'named') {
// An ampersand followed by anything unknown, and not terminated, is
// invalid.
if (terminated && !characterReference) {
warning(5 /* Unknown (named) */, 1)
} else {
// If theres something after an named reference which is not known,
// cap the reference.
if (characterReferenceCharacters !== characters) {
end = begin + characterReferenceCharacters.length
diff = 1 + end - begin
terminated = false
}
// If the reference is not terminated, warn.
if (!terminated) {
const reason = characterReferenceCharacters
? 1 /* Non terminated (named) */
: 3 /* Empty (named) */
if (options.attribute) {
const following = value.charCodeAt(end)
if (following === 61 /* `=` */) {
warning(reason, diff)
characterReference = ''
} else if (isAlphanumerical(following)) {
characterReference = ''
} else {
warning(reason, diff)
}
} else {
warning(reason, diff)
}
}
}
reference = characterReference
} else {
if (!terminated) {
// All nonterminated numeric references are not rendered, and emit a
// warning.
warning(2 /* Non terminated (numeric) */, diff)
}
// When terminated and numerical, parse as either hexadecimal or
// decimal.
let referenceCode = Number.parseInt(
characters,
type === 'hexadecimal' ? 16 : 10
)
// Emit a warning when the parsed number is prohibited, and replace with
// replacement character.
if (prohibited(referenceCode)) {
warning(7 /* Prohibited (numeric) */, diff)
reference = fromCharCode(65533 /* `<60>` */)
} else if (referenceCode in characterReferenceInvalid) {
// Emit a warning when the parsed number is disallowed, and replace by
// an alternative.
warning(6 /* Disallowed (numeric) */, diff)
reference = characterReferenceInvalid[referenceCode]
} else {
// Parse the number.
let output = ''
// Emit a warning when the parsed number should not be used.
if (disallowed(referenceCode)) {
warning(6 /* Disallowed (numeric) */, diff)
}
// Serialize the number.
if (referenceCode > 0xffff) {
referenceCode -= 0x10000
output += fromCharCode((referenceCode >>> (10 & 0x3ff)) | 0xd800)
referenceCode = 0xdc00 | (referenceCode & 0x3ff)
}
reference = output + fromCharCode(referenceCode)
}
}
// Found it!
// First eat the queued characters as normal text, then eat a reference.
if (reference) {
flush()
previous = now()
index = end - 1
column += end - start + 1
result.push(reference)
const next = now()
next.offset++
if (options.reference) {
options.reference.call(
options.referenceContext,
reference,
{start: previous, end: next},
value.slice(start - 1, end)
)
}
previous = next
} else {
// If we could not find a reference, queue the checked characters (as
// normal characters), and move the pointer to their end.
// This is possible because we can be certain neither newlines nor
// ampersands are included.
characters = value.slice(start - 1, end)
queue += characters
column += characters.length
index = end - 1
}
} else {
// Handle anything other than an ampersand, including newlines and EOF.
if (character === 10 /* `\n` */) {
line++
lines++
column = 0
}
if (Number.isNaN(character)) {
flush()
} else {
queue += fromCharCode(character)
column++
}
}
}
// Return the reduced nodes.
return result.join('')
// Get current position.
function now() {
return {
line,
column,
offset: index + ((point ? point.offset : 0) || 0)
}
}
/**
* Handle the warning.
*
* @param {1|2|3|4|5|6|7} code
* @param {number} offset
*/
function warning(code, offset) {
/** @type {ReturnType<now>} */
let position
if (options.warning) {
position = now()
position.column += offset
position.offset += offset
options.warning.call(
options.warningContext,
messages[code],
position,
code
)
}
}
/**
* Flush `queue` (normal text).
* Macro invoked before each reference and at the end of `value`.
* Does nothing when `queue` is empty.
*/
function flush() {
if (queue) {
result.push(queue)
if (options.text) {
options.text.call(options.textContext, queue, {
start: previous,
end: now()
})
}
queue = ''
}
}
}
/**
 * Tell whether a numeric reference points outside the permissible Unicode
 * range: either beyond U+10FFFF or inside the surrogate block
 * (U+D800 through U+DFFF).
 *
 * @param {number} code
 *   Code to check.
 * @returns {boolean}
 *   `true` when `code` is prohibited.
 */
function prohibited(code) {
  // Past the last Unicode code point.
  if (code > 0x10ffff) {
    return true
  }

  // Lone surrogates.
  return code >= 0xd800 && code <= 0xdfff
}
/**
 * Tell whether a numeric reference points at a disallowed code: one of the
 * listed control ranges, the U+FDD0 through U+FDEF block, or any code whose
 * low 16 bits are 0xFFFE or 0xFFFF.
 *
 * @param {number} code
 *   Code to check.
 * @returns {boolean}
 *   `true` when `code` is disallowed.
 */
function disallowed(code) {
  // 0x01-0x08, 0x0b, and 0x0d-0x1f (0x09, 0x0a, and 0x0c are not included).
  if (code >= 0x0001 && code <= 0x0008) {
    return true
  }

  if (code === 0x000b) {
    return true
  }

  if (code >= 0x000d && code <= 0x001f) {
    return true
  }

  // 0x7f-0x9f.
  if (code >= 0x007f && code <= 0x009f) {
    return true
  }

  // 0xfdd0-0xfdef.
  if (code >= 0xfdd0 && code <= 0xfdef) {
    return true
  }

  // Codes ending in 0xfffe or 0xffff.
  const lowBits = code & 0xffff
  return lowBits === 0xffff || lowBits === 0xfffe
}