164 lines
3.8 KiB
JavaScript
164 lines
3.8 KiB
JavaScript
/**
 * @typedef {import('micromark-util-types').Code} Code
 * @typedef {import('micromark-util-types').Construct} Construct
 * @typedef {import('micromark-util-types').State} State
 * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
 * @typedef {import('micromark-util-types').Tokenizer} Tokenizer
 */
|
|
|
|
import {decodeNamedCharacterReference} from 'decode-named-character-reference'
|
|
import {
|
|
asciiAlphanumeric,
|
|
asciiDigit,
|
|
asciiHexDigit
|
|
} from 'micromark-util-character'
|
|
import {codes, constants, types} from 'micromark-util-symbol'
|
|
import {ok as assert} from 'devlop'
|
|
|
|
/**
 * Construct for character references; tokenizing is delegated to
 * `tokenizeCharacterReference`.
 *
 * @type {Construct}
 */
export const characterReference = {
  name: 'characterReference',
  tokenize: tokenizeCharacterReference
}
|
|
|
|
/**
 * Tokenize a character reference: `&`, an optional `#` (numeric) and
 * optional `x`/`X` (hexadecimal) marker, a value, and a closing `;`.
 *
 * Named values are additionally checked with
 * `decodeNamedCharacterReference`; a name it does not decode is rejected.
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeCharacterReference(effects, ok, nok) {
  const context = this
  // Number of value characters seen so far.
  let consumed = 0
  /**
   * Size limit for the value; set once the reference kind is known.
   *
   * @type {number}
   */
  let limit
  /**
   * Check for a valid value character; set once the reference kind is known.
   *
   * @type {(code: Code) => boolean}
   */
  let accepts

  return start

  /**
   * Open the value token and remember the size limit and character check
   * that belong to the detected reference kind.
   *
   * @param {number} sizeMax
   *   Size limit for the value.
   * @param {(code: Code) => boolean} check
   *   Check applied to each value character.
   * @returns {undefined}
   *   Nothing.
   */
  function beginValue(sizeMax, check) {
    effects.enter(types.characterReferenceValue)
    limit = sizeMax
    accepts = check
  }

  /**
   * At the `&` that starts a character reference.
   *
   * ```markdown
   * > | a&amp;b
   *      ^
   * > | a&#123;b
   *      ^
   * > | a&#x9;b
   *      ^
   * ```
   *
   * @type {State}
   */
  function start(code) {
    assert(code === codes.ampersand, 'expected `&`')
    effects.enter(types.characterReference)
    effects.enter(types.characterReferenceMarker)
    effects.consume(code)
    effects.exit(types.characterReferenceMarker)
    return open
  }

  /**
   * Directly after `&`: a `#` selects a numeric reference, anything else is
   * handled as the first character of a named reference.
   *
   * ```markdown
   * > | a&amp;b
   *       ^
   * > | a&#123;b
   *       ^
   * > | a&#x9;b
   *       ^
   * ```
   *
   * @type {State}
   */
  function open(code) {
    if (code !== codes.numberSign) {
      // Named reference: the current code already belongs to the value.
      beginValue(constants.characterReferenceNamedSizeMax, asciiAlphanumeric)
      return value(code)
    }

    effects.enter(types.characterReferenceMarkerNumeric)
    effects.consume(code)
    effects.exit(types.characterReferenceMarkerNumeric)
    return numeric
  }

  /**
   * Directly after `#`: an `x` or `X` selects a hexadecimal reference,
   * anything else is handled as the first character of a decimal reference.
   *
   * ```markdown
   * > | a&#123;b
   *        ^
   * > | a&#x9;b
   *        ^
   * ```
   *
   * @type {State}
   */
  function numeric(code) {
    const isHexMarker = code === codes.uppercaseX || code === codes.lowercaseX

    if (!isHexMarker) {
      // Decimal reference: the current code already belongs to the value.
      beginValue(constants.characterReferenceDecimalSizeMax, asciiDigit)
      return value(code)
    }

    effects.enter(types.characterReferenceMarkerHexadecimal)
    effects.consume(code)
    effects.exit(types.characterReferenceMarkerHexadecimal)
    beginValue(constants.characterReferenceHexadecimalSizeMax, asciiHexDigit)
    return value
  }

  /**
   * Inside the value, after the markers (`&#x`, `&#`, or `&`) and before `;`.
   *
   * Which characters are accepted, and how many, depends on the kind of
   * reference detected earlier.
   *
   * ```markdown
   * > | a&amp;b
   *       ^^^
   * > | a&#123;b
   *        ^^^
   * > | a&#x9;b
   *         ^
   * ```
   *
   * @type {State}
   */
  function value(code) {
    // A `;` only closes the reference when at least one value character was
    // seen.
    if (consumed > 0 && code === codes.semicolon) {
      const token = effects.exit(types.characterReferenceValue)
      const isNamed = accepts === asciiAlphanumeric

      // Only names that decode to something are valid references.
      if (
        isNamed &&
        !decodeNamedCharacterReference(context.sliceSerialize(token))
      ) {
        return nok(code)
      }

      // To do: `markdown-rs` uses a different name:
      // `CharacterReferenceMarkerSemi`.
      effects.enter(types.characterReferenceMarker)
      effects.consume(code)
      effects.exit(types.characterReferenceMarker)
      effects.exit(types.characterReference)
      return ok
    }

    if (!accepts(code)) {
      return nok(code)
    }

    // The counter advances for every accepted character, including the one
    // that turns out to exceed the limit.
    const fits = consumed < limit
    consumed += 1

    if (!fits) {
      return nok(code)
    }

    effects.consume(code)
    return value
  }
}
|