site/node_modules/micromark/lib/create-tokenizer.js

583 lines
15 KiB
JavaScript
Raw Permalink Normal View History

2024-10-14 06:09:33 +00:00
/**
* @typedef {import('micromark-util-types').Chunk} Chunk
* @typedef {import('micromark-util-types').Code} Code
* @typedef {import('micromark-util-types').Construct} Construct
* @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
* @typedef {import('micromark-util-types').Effects} Effects
* @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
* @typedef {import('micromark-util-types').ParseContext} ParseContext
* @typedef {import('micromark-util-types').Point} Point
* @typedef {import('micromark-util-types').State} State
* @typedef {import('micromark-util-types').Token} Token
* @typedef {import('micromark-util-types').TokenType} TokenType
* @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
*/
/**
* @callback Restore
* @returns {undefined}
*
* @typedef Info
* @property {Restore} restore
* @property {number} from
*
* @callback ReturnHandle
* Handle a successful run.
* @param {Construct} construct
* @param {Info} info
* @returns {undefined}
*/
import {markdownLineEnding} from 'micromark-util-character'
import {push, splice} from 'micromark-util-chunked'
import {resolveAll} from 'micromark-util-resolve-all'
/**
* Create a tokenizer.
* Tokenizers deal with one type of data (e.g., containers, flow, text).
* The parser is the object dealing with it all.
* `initialize` works like other constructs, except that only its `tokenize`
* function is used, in which case it doesnt receive an `ok` or `nok`.
* `from` can be given to set the point before the first character, although
* when further lines are indented, they must be set with `defineSkip`.
*
* @param {ParseContext} parser
* @param {InitialConstruct} initialize
* @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
* @returns {TokenizeContext}
*/
export function createTokenizer(parser, initialize, from) {
/** @type {Point} */
let point = Object.assign(
from
? Object.assign({}, from)
: {
line: 1,
column: 1,
offset: 0
},
{
_index: 0,
_bufferIndex: -1
}
)
/** @type {Record<string, number>} */
const columnStart = {}
/** @type {Array<Construct>} */
const resolveAllConstructs = []
/** @type {Array<Chunk>} */
let chunks = []
/** @type {Array<Token>} */
let stack = []
/** @type {boolean | undefined} */
let consumed = true
/**
* Tools used for tokenizing.
*
* @type {Effects}
*/
const effects = {
consume,
enter,
exit,
attempt: constructFactory(onsuccessfulconstruct),
check: constructFactory(onsuccessfulcheck),
interrupt: constructFactory(onsuccessfulcheck, {
interrupt: true
})
}
/**
* State and tools for resolving and serializing.
*
* @type {TokenizeContext}
*/
const context = {
previous: null,
code: null,
containerState: {},
events: [],
parser,
sliceStream,
sliceSerialize,
now,
defineSkip,
write
}
/**
* The state function.
*
* @type {State | undefined}
*/
let state = initialize.tokenize.call(context, effects)
/**
* Track which character we expect to be consumed, to catch bugs.
*
* @type {Code}
*/
let expectedCode
if (initialize.resolveAll) {
resolveAllConstructs.push(initialize)
}
return context
/** @type {TokenizeContext['write']} */
function write(slice) {
chunks = push(chunks, slice)
main()
// Exit if were not done, resolve might change stuff.
if (chunks[chunks.length - 1] !== null) {
return []
}
addResult(initialize, 0)
// Otherwise, resolve, and exit.
context.events = resolveAll(resolveAllConstructs, context.events, context)
return context.events
}
//
// Tools.
//
/** @type {TokenizeContext['sliceSerialize']} */
function sliceSerialize(token, expandTabs) {
return serializeChunks(sliceStream(token), expandTabs)
}
/** @type {TokenizeContext['sliceStream']} */
function sliceStream(token) {
return sliceChunks(chunks, token)
}
/** @type {TokenizeContext['now']} */
function now() {
// This is a hot path, so we clone manually instead of `Object.assign({}, point)`
const {line, column, offset, _index, _bufferIndex} = point
return {
line,
column,
offset,
_index,
_bufferIndex
}
}
/** @type {TokenizeContext['defineSkip']} */
function defineSkip(value) {
columnStart[value.line] = value.column
accountForPotentialSkip()
}
//
// State management.
//
/**
* Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
* `consume`).
* Here is where we walk through the chunks, which either include strings of
* several characters, or numerical character codes.
* The reason to do this in a loop instead of a call is so the stack can
* drain.
*
* @returns {undefined}
*/
function main() {
/** @type {number} */
let chunkIndex
while (point._index < chunks.length) {
const chunk = chunks[point._index]
// If were in a buffer chunk, loop through it.
if (typeof chunk === 'string') {
chunkIndex = point._index
if (point._bufferIndex < 0) {
point._bufferIndex = 0
}
while (
point._index === chunkIndex &&
point._bufferIndex < chunk.length
) {
go(chunk.charCodeAt(point._bufferIndex))
}
} else {
go(chunk)
}
}
}
/**
* Deal with one code.
*
* @param {Code} code
* @returns {undefined}
*/
function go(code) {
consumed = undefined
expectedCode = code
state = state(code)
}
/** @type {Effects['consume']} */
function consume(code) {
if (markdownLineEnding(code)) {
point.line++
point.column = 1
point.offset += code === -3 ? 2 : 1
accountForPotentialSkip()
} else if (code !== -1) {
point.column++
point.offset++
}
// Not in a string chunk.
if (point._bufferIndex < 0) {
point._index++
} else {
point._bufferIndex++
// At end of string chunk.
// @ts-expect-error Points w/ non-negative `_bufferIndex` reference
// strings.
if (point._bufferIndex === chunks[point._index].length) {
point._bufferIndex = -1
point._index++
}
}
// Expose the previous character.
context.previous = code
// Mark as consumed.
consumed = true
}
/** @type {Effects['enter']} */
function enter(type, fields) {
/** @type {Token} */
// @ts-expect-error Patch instead of assign required fields to help GC.
const token = fields || {}
token.type = type
token.start = now()
context.events.push(['enter', token, context])
stack.push(token)
return token
}
/** @type {Effects['exit']} */
function exit(type) {
const token = stack.pop()
token.end = now()
context.events.push(['exit', token, context])
return token
}
/**
* Use results.
*
* @type {ReturnHandle}
*/
function onsuccessfulconstruct(construct, info) {
addResult(construct, info.from)
}
/**
* Discard results.
*
* @type {ReturnHandle}
*/
function onsuccessfulcheck(_, info) {
info.restore()
}
/**
* Factory to attempt/check/interrupt.
*
* @param {ReturnHandle} onreturn
* @param {{interrupt?: boolean | undefined} | undefined} [fields]
*/
function constructFactory(onreturn, fields) {
return hook
/**
* Handle either an object mapping codes to constructs, a list of
* constructs, or a single construct.
*
* @param {Array<Construct> | Construct | ConstructRecord} constructs
* @param {State} returnState
* @param {State | undefined} [bogusState]
* @returns {State}
*/
function hook(constructs, returnState, bogusState) {
/** @type {Array<Construct>} */
let listOfConstructs
/** @type {number} */
let constructIndex
/** @type {Construct} */
let currentConstruct
/** @type {Info} */
let info
return Array.isArray(constructs) /* c8 ignore next 1 */
? handleListOfConstructs(constructs)
: 'tokenize' in constructs
? // @ts-expect-error Looks like a construct.
handleListOfConstructs([constructs])
: handleMapOfConstructs(constructs)
/**
* Handle a list of construct.
*
* @param {ConstructRecord} map
* @returns {State}
*/
function handleMapOfConstructs(map) {
return start
/** @type {State} */
function start(code) {
const def = code !== null && map[code]
const all = code !== null && map.null
const list = [
// To do: add more extension tests.
/* c8 ignore next 2 */
...(Array.isArray(def) ? def : def ? [def] : []),
...(Array.isArray(all) ? all : all ? [all] : [])
]
return handleListOfConstructs(list)(code)
}
}
/**
* Handle a list of construct.
*
* @param {Array<Construct>} list
* @returns {State}
*/
function handleListOfConstructs(list) {
listOfConstructs = list
constructIndex = 0
if (list.length === 0) {
return bogusState
}
return handleConstruct(list[constructIndex])
}
/**
* Handle a single construct.
*
* @param {Construct} construct
* @returns {State}
*/
function handleConstruct(construct) {
return start
/** @type {State} */
function start(code) {
// To do: not needed to store if there is no bogus state, probably?
// Currently doesnt work because `inspect` in document does a check
// w/o a bogus, which doesnt make sense. But it does seem to help perf
// by not storing.
info = store()
currentConstruct = construct
if (!construct.partial) {
context.currentConstruct = construct
}
// Always populated by defaults.
if (
construct.name &&
context.parser.constructs.disable.null.includes(construct.name)
) {
return nok(code)
}
return construct.tokenize.call(
// If we do have fields, create an object w/ `context` as its
// prototype.
// This allows a “live binding”, which is needed for `interrupt`.
fields ? Object.assign(Object.create(context), fields) : context,
effects,
ok,
nok
)(code)
}
}
/** @type {State} */
function ok(code) {
consumed = true
onreturn(currentConstruct, info)
return returnState
}
/** @type {State} */
function nok(code) {
consumed = true
info.restore()
if (++constructIndex < listOfConstructs.length) {
return handleConstruct(listOfConstructs[constructIndex])
}
return bogusState
}
}
}
/**
* @param {Construct} construct
* @param {number} from
* @returns {undefined}
*/
function addResult(construct, from) {
if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
resolveAllConstructs.push(construct)
}
if (construct.resolve) {
splice(
context.events,
from,
context.events.length - from,
construct.resolve(context.events.slice(from), context)
)
}
if (construct.resolveTo) {
context.events = construct.resolveTo(context.events, context)
}
}
/**
* Store state.
*
* @returns {Info}
*/
function store() {
const startPoint = now()
const startPrevious = context.previous
const startCurrentConstruct = context.currentConstruct
const startEventsIndex = context.events.length
const startStack = Array.from(stack)
return {
restore,
from: startEventsIndex
}
/**
* Restore state.
*
* @returns {undefined}
*/
function restore() {
point = startPoint
context.previous = startPrevious
context.currentConstruct = startCurrentConstruct
context.events.length = startEventsIndex
stack = startStack
accountForPotentialSkip()
}
}
/**
* Move the current point a bit forward in the line when its on a column
* skip.
*
* @returns {undefined}
*/
function accountForPotentialSkip() {
if (point.line in columnStart && point.column < 2) {
point.column = columnStart[point.line]
point.offset += columnStart[point.line] - 1
}
}
}
/**
* Get the chunks from a slice of chunks in the range of a token.
*
* @param {Array<Chunk>} chunks
* @param {Pick<Token, 'end' | 'start'>} token
* @returns {Array<Chunk>}
*/
function sliceChunks(chunks, token) {
const startIndex = token.start._index
const startBufferIndex = token.start._bufferIndex
const endIndex = token.end._index
const endBufferIndex = token.end._bufferIndex
/** @type {Array<Chunk>} */
let view
if (startIndex === endIndex) {
// @ts-expect-error `_bufferIndex` is used on string chunks.
view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)]
} else {
view = chunks.slice(startIndex, endIndex)
if (startBufferIndex > -1) {
const head = view[0]
if (typeof head === 'string') {
view[0] = head.slice(startBufferIndex)
} else {
view.shift()
}
}
if (endBufferIndex > 0) {
// @ts-expect-error `_bufferIndex` is used on string chunks.
view.push(chunks[endIndex].slice(0, endBufferIndex))
}
}
return view
}
/**
* Get the string value of a slice of chunks.
*
* @param {Array<Chunk>} chunks
* @param {boolean | undefined} [expandTabs=false]
* @returns {string}
*/
function serializeChunks(chunks, expandTabs) {
let index = -1
/** @type {Array<string>} */
const result = []
/** @type {boolean | undefined} */
let atTab
while (++index < chunks.length) {
const chunk = chunks[index]
/** @type {string} */
let value
if (typeof chunk === 'string') {
value = chunk
} else
switch (chunk) {
case -5: {
value = '\r'
break
}
case -4: {
value = '\n'
break
}
case -3: {
value = '\r' + '\n'
break
}
case -2: {
value = expandTabs ? ' ' : '\t'
break
}
case -1: {
if (!expandTabs && atTab) continue
value = ' '
break
}
default: {
// Currently only replacement character.
value = String.fromCharCode(chunk)
}
}
atTab = chunk === -2
result.push(value)
}
return result.join('')
}