/**
 * @typedef {import('unist').Point} Point
 * @typedef {import('unist').Position} Position
 */

import {characterEntitiesLegacy} from 'character-entities-legacy'
import {characterReferenceInvalid} from 'character-reference-invalid'
import {isDecimal} from 'is-decimal'
import {isHexadecimal} from 'is-hexadecimal'
import {isAlphanumerical} from 'is-alphanumerical'
import {decodeNamedCharacterReference} from 'decode-named-character-reference'

const fromCharCode = String.fromCharCode

// Warning messages, indexed by warning code.
// Index 0 is an empty placeholder so that the codes passed to `warning`
// (1 through 7) can be used directly as indices.
const messages = [
  '',
  /* 1: Non terminated (named) */
  'Named character references must be terminated by a semicolon',
  /* 2: Non terminated (numeric) */
  'Numeric character references must be terminated by a semicolon',
  /* 3: Empty (named) */
  'Named character references cannot be empty',
  /* 4: Empty (numeric) */
  'Numeric character references cannot be empty',
  /* 5: Unknown (named) */
  'Named character references must be known',
  /* 6: Disallowed (numeric) */
  'Numeric character references cannot be disallowed',
  /* 7: Prohibited (numeric) */
  'Numeric character references cannot be outside the permissible Unicode range'
]

/**
 * Parse HTML character references.
 *
 * Walks `value` one UTF-16 code unit at a time and decodes named (`&amp;`),
 * decimal (`&#38;`), and hexadecimal (`&#x26;`) character references.
 * Plain text is collected in a queue and flushed before each decoded
 * reference and at the end of `value`.
 *
 * The option fields read by this function are `additional`, `attribute`,
 * `nonTerminated`, `position`, `reference`, `referenceContext`, `text`,
 * `textContext`, `warning`, and `warningContext`.
 *
 * @param {string} value
 *   Text to decode.
 * @param {import('../index.js').Options} [options={}]
 *   Configuration.
 * @returns {string}
 *   `value` with its character references decoded.
 */
export function parseEntities(value, options = {}) {
  // `additional` may be given as a string, in which case only its first code
  // unit is used.
  const additional =
    typeof options.additional === 'string'
      ? options.additional.charCodeAt(0)
      : options.additional
  /** @type {Array<string>} */
  const result = []
  let index = 0
  // Number of newlines seen so far minus one; used to index `indent`.
  let lines = -1
  let queue = ''
  /** @type {Point|undefined} */
  let point
  /** @type {Array<number>|undefined} */
  let indent

  // `options.position` is either a position (with `start` and optionally
  // `indent`) or a bare point.
  if (options.position) {
    if ('start' in options.position || 'indent' in options.position) {
      // @ts-expect-error: points don’t have indent.
      indent = options.position.indent
      // @ts-expect-error: points don’t have indent.
      point = options.position.start
    } else {
      point = options.position
    }
  }

  let line = (point ? point.line : 0) || 1
  let column = (point ? point.column : 0) || 1

  // Cache the current point.
  let previous = now()
  /** @type {number|undefined} */
  let character

  // Ensure the algorithm walks over the first character (inclusive).
  index--

  // The loop deliberately runs one step past the last character: there,
  // `charCodeAt` yields `NaN`, which triggers the final `flush()`.
  while (++index <= value.length) {
    // If the previous character was a newline.
    if (character === 10 /* `\n` */) {
      column = (indent ? indent[lines] : 0) || 1
    }

    character = value.charCodeAt(index)

    if (character === 38 /* `&` */) {
      const following = value.charCodeAt(index + 1)

      // The behavior depends on the identity of the next character.
      if (
        following === 9 /* `\t` */ ||
        following === 10 /* `\n` */ ||
        following === 12 /* `\f` */ ||
        following === 32 /* ` ` */ ||
        following === 38 /* `&` */ ||
        following === 60 /* `<` */ ||
        Number.isNaN(following) ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++
        continue
      }

      // `start` is the index right after `&`; `begin` is where the name or
      // digits start (after `#` / `#x`); `end` is advanced past them below.
      const start = index + 1
      let begin = start
      let end = start
      /** @type {string} */
      let type

      if (following === 35 /* `#` */) {
        // Numerical reference.
        end = ++begin

        // The behavior further depends on the next character.
        const following = value.charCodeAt(end)

        if (following === 88 /* `X` */ || following === 120 /* `x` */) {
          // ASCII hexadecimal digits.
          type = 'hexadecimal'
          end = ++begin
        } else {
          // ASCII decimal digits.
          type = 'decimal'
        }
      } else {
        // Named reference.
        type = 'named'
      }

      // Longest known name seen so far, and what it decodes to.
      let characterReferenceCharacters = ''
      let characterReference = ''
      // Every character accepted by `test` after `begin`.
      let characters = ''
      // Each type of character reference accepts different characters.
      // This test is used to detect whether a reference has ended (as the semicolon
      // is not strictly needed).
      const test =
        type === 'named'
          ? isAlphanumerical
          : type === 'decimal'
          ? isDecimal
          : isHexadecimal

      end--

      while (++end <= value.length) {
        const following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === 'named' && characterEntitiesLegacy.includes(characters)) {
          characterReferenceCharacters = characters
          // @ts-expect-error: always able to decode.
          characterReference = decodeNamedCharacterReference(characters)
        }
      }

      let terminated = value.charCodeAt(end) === 59 /* `;` */

      if (terminated) {
        end++

        const namedReference =
          type === 'named' ? decodeNamedCharacterReference(characters) : false

        if (namedReference) {
          characterReferenceCharacters = characters
          characterReference = namedReference
        }
      }

      let diff = 1 + end - start
      let reference = ''

      if (!terminated && options.nonTerminated === false) {
        // Empty: nonterminated references are left as plain text when
        // `nonTerminated` is explicitly `false`.
      } else if (!characters) {
        // An empty (possible) reference is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== 'named') {
          warning(4 /* Empty (numeric) */, diff)
        }
      } else if (type === 'named') {
        // A terminated reference whose name could not be decoded is invalid.
        if (terminated && !characterReference) {
          warning(5 /* Unknown (named) */, 1)
        } else {
          // If there’s something after a named reference which is not known,
          // cap the reference.
          if (characterReferenceCharacters !== characters) {
            end = begin + characterReferenceCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            const reason = characterReferenceCharacters
              ? 1 /* Non terminated (named) */
              : 3 /* Empty (named) */

            if (options.attribute) {
              const following = value.charCodeAt(end)

              // In attribute mode, a nonterminated reference followed by `=`
              // or an alphanumerical is not decoded; only the `=` case warns.
              if (following === 61 /* `=` */) {
                warning(reason, diff)
                characterReference = ''
              } else if (isAlphanumerical(following)) {
                characterReference = ''
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = characterReference
      } else {
        if (!terminated) {
          // Nonterminated numeric references emit a warning, but are still
          // decoded below.
          warning(2 /* Non terminated (numeric) */, diff)
        }

        // Parse the digits as either hexadecimal or decimal.
        let referenceCode = Number.parseInt(
          characters,
          type === 'hexadecimal' ? 16 : 10
        )

        // Emit a warning when the parsed number is prohibited, and replace with
        // replacement character.
        if (prohibited(referenceCode)) {
          warning(7 /* Prohibited (numeric) */, diff)
          reference = fromCharCode(65533 /* U+FFFD REPLACEMENT CHARACTER */)
        } else if (referenceCode in characterReferenceInvalid) {
          // Emit a warning when the parsed number is disallowed, and replace by
          // an alternative.
          warning(6 /* Disallowed (numeric) */, diff)
          reference = characterReferenceInvalid[referenceCode]
        } else {
          // Parse the number.
          let output = ''

          // Emit a warning when the parsed number should not be used.
          if (disallowed(referenceCode)) {
            warning(6 /* Disallowed (numeric) */, diff)
          }

          // Serialize the number; code points above U+FFFF need a surrogate
          // pair (high surrogate from the top 10 bits, low surrogate from the
          // bottom 10 bits of `code - 0x10000`).
          if (referenceCode > 0xffff) {
            referenceCode -= 0x10000
            output += fromCharCode(((referenceCode >>> 10) & 0x3ff) | 0xd800)
            referenceCode = 0xdc00 | (referenceCode & 0x3ff)
          }

          reference = output + fromCharCode(referenceCode)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat a reference.
      if (reference) {
        flush()

        previous = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        const next = now()
        // `index` points at the last consumed character; the end point is one
        // past it.
        next.offset++

        if (options.reference) {
          options.reference.call(
            options.referenceContext,
            reference,
            {start: previous, end: next},
            value.slice(start - 1, end)
          )
        }

        previous = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === 10 /* `\n` */) {
        line++
        lines++
        column = 0
      }

      if (Number.isNaN(character)) {
        // End of `value`.
        flush()
      } else {
        queue += fromCharCode(character)
        column++
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  /**
   * Get the current position.
   *
   * @returns {{line: number, column: number, offset: number}}
   *   Current line and column, plus `index` shifted by the offset of the
   *   starting point (if any).
   */
  function now() {
    return {
      line,
      column,
      offset: index + ((point ? point.offset : 0) || 0)
    }
  }

  /**
   * Handle the warning: when `options.warning` is given, call it (with
   * `options.warningContext` as `this`) with the message, the position of the
   * problem, and the code.
   *
   * @param {1|2|3|4|5|6|7} code
   *   Warning code (index into `messages`).
   * @param {number} offset
   *   Number of characters to add to the current column and offset.
   */
  function warning(code, offset) {
    /** @type {ReturnType<now>} */
    let position

    if (options.warning) {
      position = now()
      position.column += offset
      position.offset += offset

      options.warning.call(
        options.warningContext,
        messages[code],
        position,
        code
      )
    }
  }

  /**
   * Flush `queue` (normal text).
   * Macro invoked before each reference and at the end of `value`.
   * Does nothing when `queue` is empty.
   */
  function flush() {
    if (queue) {
      result.push(queue)

      if (options.text) {
        options.text.call(options.textContext, queue, {
          start: previous,
          end: now()
        })
      }

      queue = ''
    }
  }
}

/**
 * Check if `code` is outside the permissible unicode range.
 *
 * @param {number} code
 *   Numeric code to check.
 * @returns {boolean}
 *   `true` when `code` lies in the surrogate range (0xD800 through 0xDFFF) or
 *   is greater than 0x10FFFF; `false` otherwise.
 */
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}

/**
 * Check if `code` is disallowed.
 *
 * @param {number} code
 *   Numeric code to check.
 * @returns {boolean}
 *   `true` when `code` falls in one of the ranges listed in the body;
 *   `false` otherwise.
 */
function disallowed(code) {
  return (
    // Control codes 0x01–0x08, 0x0B, and 0x0D–0x1F (so 0x00, tab, line feed,
    // and form feed are not matched here).
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    // 0x7F through 0x9F.
    (code >= 0x007f && code <= 0x009f) ||
    // 0xFDD0 through 0xFDEF.
    (code >= 0xfdd0 && code <= 0xfdef) ||
    // Any code whose low 16 bits are 0xFFFF or 0xFFFE (0xFFFE, 0x1FFFF, …).
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}