site/node_modules/micromark/dev/lib/create-tokenizer.js

671 lines
17 KiB
JavaScript
Raw Normal View History

2024-10-14 08:09:33 +02:00
/**
* @typedef {import('micromark-util-types').Chunk} Chunk
* @typedef {import('micromark-util-types').Code} Code
* @typedef {import('micromark-util-types').Construct} Construct
* @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
* @typedef {import('micromark-util-types').Effects} Effects
* @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
* @typedef {import('micromark-util-types').ParseContext} ParseContext
* @typedef {import('micromark-util-types').Point} Point
* @typedef {import('micromark-util-types').State} State
* @typedef {import('micromark-util-types').Token} Token
* @typedef {import('micromark-util-types').TokenType} TokenType
* @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
*/
/**
* @callback Restore
* @returns {undefined}
*
* @typedef Info
* @property {Restore} restore
* @property {number} from
*
* @callback ReturnHandle
* Handle a successful run.
* @param {Construct} construct
* @param {Info} info
* @returns {undefined}
*/
import createDebug from 'debug'
import {markdownLineEnding} from 'micromark-util-character'
import {push, splice} from 'micromark-util-chunked'
import {resolveAll} from 'micromark-util-resolve-all'
import {codes, values} from 'micromark-util-symbol'
import {ok as assert} from 'devlop'
const debug = createDebug('micromark')
/**
* Create a tokenizer.
* Tokenizers deal with one type of data (e.g., containers, flow, text).
* The parser is the object dealing with it all.
* `initialize` works like other constructs, except that only its `tokenize`
* function is used, in which case it doesn't receive an `ok` or `nok`.
* `from` can be given to set the point before the first character, although
* when further lines are indented, they must be set with `defineSkip`.
*
* @param {ParseContext} parser
* @param {InitialConstruct} initialize
* @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
* @returns {TokenizeContext}
*/
export function createTokenizer(parser, initialize, from) {
/**
* Current position in the stream.
* Starts as a copy of `from` when given, otherwise at line 1, column 1,
* offset 0; `_index` and `_bufferIndex` always start at `0` and `-1`.
* Reassigned as a whole when a stored state is restored.
*
* @type {Point}
*/
let point = Object.assign(
from ? Object.assign({}, from) : {line: 1, column: 1, offset: 0},
{_index: 0, _bufferIndex: -1}
)
/**
* Column to skip to per line, keyed by line number.
* Written by `defineSkip`, read by `accountForPotentialSkip`.
*
* @type {Record<string, number>}
*/
const columnStart = {}
/**
* Constructs that have a `resolveAll`; passed to `resolveAll` in `write`
* once the EOF chunk was written.
*
* @type {Array<Construct>}
*/
const resolveAllConstructs = []
/**
* Everything written so far.
*
* @type {Array<Chunk>}
*/
let chunks = []
/**
* Currently open tokens: `enter` pushes, `exit` pops.
*
* @type {Array<Token>}
*/
let stack = []
/**
* Whether the code last passed to the state machine was dealt with.
* `go` sets it to `undefined`; `consume`, `ok`, and `nok` set it back to
* `true`. Only used in assertions.
*
* @type {boolean | undefined}
*/
let consumed = true
/**
* Tools used for tokenizing.
* `attempt` keeps the events of a successful construct, whereas `check` and
* `interrupt` restore the stored state on success.
*
* @type {Effects}
*/
const effects = {
consume,
enter,
exit,
attempt: constructFactory(onsuccessfulconstruct),
check: constructFactory(onsuccessfulcheck),
interrupt: constructFactory(onsuccessfulcheck, {interrupt: true})
}
/**
* State and tools for resolving and serializing.
* This is the object returned to the caller, and the `this` of tokenizers.
*
* @type {TokenizeContext}
*/
const context = {
previous: codes.eof,
code: codes.eof,
containerState: {},
events: [],
parser,
sliceStream,
sliceSerialize,
now,
defineSkip,
write
}
/**
* The state function.
* Initially whatever `initialize.tokenize` returns; replaced in `go` by the
* result of calling it with each code.
*
* @type {State | undefined}
*/
let state = initialize.tokenize.call(context, effects)
/**
* Track which character we expect to be consumed, to catch bugs.
* Set in `go`, compared in `consume`, `ok`, and `nok`.
*
* @type {Code}
*/
let expectedCode
// The initial construct takes part in the final `resolveAll` pass too.
if (initialize.resolveAll) {
resolveAllConstructs.push(initialize)
}
return context
/**
 * Append a slice of chunks and run the state machine over it.
 * Yields an empty list as long as the last chunk is not EOF; once it is, the
 * events are resolved, stored on `context.events`, and returned.
 *
 * @type {TokenizeContext['write']}
 */
function write(slice) {
  chunks = push(chunks, slice)
  main()

  const streamEnded = chunks[chunks.length - 1] === codes.eof

  if (streamEnded) {
    addResult(initialize, 0)
    // Everything is in: run the collected `resolveAll` handlers.
    context.events = resolveAll(resolveAllConstructs, context.events, context)
    return context.events
  }

  // We're not done yet, and resolving might still change stuff.
  return []
}
//
// Tools.
//
/**
 * Get the string value of the chunks spanned by `token`.
 *
 * @type {TokenizeContext['sliceSerialize']}
 */
function sliceSerialize(token, expandTabs) {
  const stream = sliceStream(token)
  return serializeChunks(stream, expandTabs)
}
/**
 * Get the chunks spanned by `token`, out of everything written so far.
 *
 * @type {TokenizeContext['sliceStream']}
 */
function sliceStream(token) {
  const view = sliceChunks(chunks, token)
  return view
}
/**
 * Get a copy of the current point.
 *
 * @type {TokenizeContext['now']}
 */
function now() {
  // Hot path: the fields are spelled out by hand instead of going through
  // `Object.assign({}, point)`.
  return {
    line: point.line,
    column: point.column,
    offset: point.offset,
    _index: point._index,
    _bufferIndex: point._bufferIndex
  }
}
/**
 * Register the column at which content starts on a line, and apply it right
 * away in case the current point is at the start of that line.
 *
 * @type {TokenizeContext['defineSkip']}
 */
function defineSkip(value) {
  const {line, column} = value
  columnStart[line] = column
  accountForPotentialSkip()
  debug('position: define skip: `%j`', point)
}
//
// State management.
//
/**
 * Main loop: feed every chunk that was not handled yet to the state machine.
 * A chunk is either a string, whose character codes are passed one by one,
 * or a single numerical character code.
 * `_index` and `_bufferIndex` in `point` are not advanced here but by
 * `consume`, and `point` itself can be swapped out when state is restored,
 * which is why it is read afresh on every step.
 * This is a loop rather than recursion so the stack can drain.
 *
 * @returns {undefined}
 */
function main() {
  while (point._index < chunks.length) {
    const current = chunks[point._index]

    // A numerical code: hand it over as is.
    if (typeof current !== 'string') {
      go(current)
      continue
    }

    // We're in a buffer chunk: walk through it for as long as we stay in it.
    const stringIndex = point._index

    if (point._bufferIndex < 0) {
      point._bufferIndex = 0
    }

    while (
      point._index === stringIndex &&
      point._bufferIndex < current.length
    ) {
      go(current.charCodeAt(point._bufferIndex))
    }
  }
}
/**
* Deal with one code: pass it to the current state and keep what that returns
* as the next state.
*
* @param {Code} code
* Code to pass to the current state.
* @returns {undefined}
*/
function go(code) {
// The previous code must have been dealt with before the next one is passed.
assert(consumed === true, 'expected character to be consumed')
// Cleared here; set back to `true` by `consume`, `ok`, or `nok`.
consumed = undefined
debug('main: passing `%s` to %s', code, state && state.name)
// Remembered so `consume`, `ok`, and `nok` can verify they get this code.
expectedCode = code
assert(typeof state === 'function', 'expected state')
state = state(code)
}
/** @type {Effects['consume']} */
function consume(code) {
assert(code === expectedCode, 'expected given code to equal expected code')
debug('consume: `%s`', code)
assert(
consumed === undefined,
'expected code to not have been consumed: this might be because `return x(code)` instead of `return x` was used'
)
assert(
code === null
? context.events.length === 0 ||
context.events[context.events.length - 1][0] === 'exit'
: context.events[context.events.length - 1][0] === 'enter',
'expected last token to be open'
)
if (markdownLineEnding(code)) {
point.line++
point.column = 1
point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
accountForPotentialSkip()
debug('position: after eol: `%j`', point)
} else if (code !== codes.virtualSpace) {
point.column++
point.offset++
}
// Not in a string chunk.
if (point._bufferIndex < 0) {
point._index++
} else {
point._bufferIndex++
// At end of string chunk.
// @ts-expect-error Points w/ non-negative `_bufferIndex` reference
// strings.
if (point._bufferIndex === chunks[point._index].length) {
point._bufferIndex = -1
point._index++
}
}
// Expose the previous character.
context.previous = code
// Mark as consumed.
consumed = true
}
/** @type {Effects['enter']} */
function enter(type, fields) {
/** @type {Token} */
// @ts-expect-error Patch instead of assign required fields to help GC.
const token = fields || {}
token.type = type
token.start = now()
assert(typeof type === 'string', 'expected string type')
assert(type.length > 0, 'expected non-empty string')
debug('enter: `%s`', type)
context.events.push(['enter', token, context])
stack.push(token)
return token
}
/**
 * Close the most recently opened token, which must be of `type` and must
 * span at least one code, and add an `exit` event for it.
 *
 * @type {Effects['exit']}
 */
function exit(type) {
  assert(typeof type === 'string', 'expected string type')
  assert(type.length > 0, 'expected non-empty string')

  const closing = stack.pop()
  assert(closing, 'cannot close w/o open tokens')
  closing.end = now()
  assert(type === closing.type, 'expected exit token to match current token')

  // A token must not start and end at the very same place.
  const {start, end} = closing
  assert(
    start._index !== end._index || start._bufferIndex !== end._bufferIndex,
    'expected non-empty token (`' + type + '`)'
  )

  debug('exit: `%s`', closing.type)
  context.events.push(['exit', closing, context])
  return closing
}
/**
 * Success handler that keeps the results: the events added since `info.from`
 * are handed to `addResult` for the construct.
 *
 * @type {ReturnHandle}
 */
function onsuccessfulconstruct(construct, info) {
  const {from} = info
  addResult(construct, from)
}
/**
 * Success handler that discards the results: the state stored before the
 * construct ran is restored.
 *
 * @type {ReturnHandle}
 */
function onsuccessfulcheck(_, info) {
  const {restore} = info
  restore()
}
/**
* Factory to attempt/check/interrupt.
* Returns `hook`, which tries constructs one after another and calls
* `onreturn` for the first one that succeeds.
*
* @param {ReturnHandle} onreturn
* Called with the successful construct and the info stored before it ran.
* @param {{interrupt?: boolean | undefined} | undefined} [fields]
* Extra fields exposed on the context that tokenizers run with
* (`{interrupt: true}` for `effects.interrupt`).
*/
function constructFactory(onreturn, fields) {
return hook
/**
* Handle either an object mapping codes to constructs, a list of
* constructs, or a single construct.
*
* @param {Array<Construct> | Construct | ConstructRecord} constructs
* Constructs to try.
* @param {State} returnState
* State to continue in after a construct succeeded.
* @param {State | undefined} [bogusState]
* State to continue in when no construct succeeded.
* @returns {State}
*/
function hook(constructs, returnState, bogusState) {
// These four are shared by all the nested states below: `ok` and `nok` read
// what `handleListOfConstructs` and `handleConstruct` last wrote.
/** @type {Array<Construct>} */
let listOfConstructs
/** @type {number} */
let constructIndex
/** @type {Construct} */
let currentConstruct
/** @type {Info} */
let info
return Array.isArray(constructs)
? /* c8 ignore next 1 */
handleListOfConstructs(constructs)
: 'tokenize' in constructs
? // @ts-expect-error Looks like a construct.
handleListOfConstructs([constructs])
: handleMapOfConstructs(constructs)
/**
* Handle a map of constructs, keyed by code.
* The constructs to try are only known once the first code comes in.
*
* @param {ConstructRecord} map
* @returns {State}
*/
function handleMapOfConstructs(map) {
return start
/** @type {State} */
function start(code) {
// For EOF (`null`) both are `false`, so the list ends up empty.
const def = code !== null && map[code]
const all = code !== null && map.null
// Constructs for this specific code go first, then those under `null`.
const list = [
// To do: add more extension tests.
/* c8 ignore next 2 */
...(Array.isArray(def) ? def : def ? [def] : []),
...(Array.isArray(all) ? all : all ? [all] : [])
]
return handleListOfConstructs(list)(code)
}
}
/**
* Handle a list of constructs: start with the first one, or go to
* `bogusState` right away when the list is empty.
*
* @param {Array<Construct>} list
* @returns {State}
*/
function handleListOfConstructs(list) {
listOfConstructs = list
constructIndex = 0
if (list.length === 0) {
assert(bogusState, 'expected `bogusState` to be given')
return bogusState
}
return handleConstruct(list[constructIndex])
}
/**
* Handle a single construct.
*
* @param {Construct} construct
* @returns {State}
*/
function handleConstruct(construct) {
return start
/** @type {State} */
function start(code) {
// To do: not needed to store if there is no bogus state, probably?
// Currently doesn't work because `inspect` in document does a check
// w/o a bogus, which doesn't make sense. But it does seem to help perf
// by not storing.
info = store()
currentConstruct = construct
// Partial constructs are not exposed as the current construct.
if (!construct.partial) {
context.currentConstruct = construct
}
// Always populated by defaults.
assert(
context.parser.constructs.disable.null,
'expected `disable.null` to be populated'
)
// Constructs that are disabled by name fail without being run.
if (
construct.name &&
context.parser.constructs.disable.null.includes(construct.name)
) {
return nok(code)
}
return construct.tokenize.call(
// If we do have fields, create an object w/ `context` as its
// prototype.
// This allows a “live binding”, which is needed for `interrupt`.
fields ? Object.assign(Object.create(context), fields) : context,
effects,
ok,
nok
)(code)
}
}
/**
* The construct succeeded: let `onreturn` deal with the results and
* continue in `returnState`.
*
* @type {State}
*/
function ok(code) {
assert(code === expectedCode, 'expected code')
consumed = true
onreturn(currentConstruct, info)
return returnState
}
/**
* The construct failed: restore the state stored before it ran, then try
* the next construct in the list, or continue in `bogusState` when there
* is none left.
*
* @type {State}
*/
function nok(code) {
assert(code === expectedCode, 'expected code')
consumed = true
info.restore()
if (++constructIndex < listOfConstructs.length) {
return handleConstruct(listOfConstructs[constructIndex])
}
return bogusState
}
}
}
/**
 * Take the events of a successful construct into account: remember the
 * construct when it has a `resolveAll`, and run its `resolve` (on the events
 * from `from` on) and its `resolveTo` (on all events) when present.
 *
 * @param {Construct} construct
 *   Construct that succeeded.
 * @param {number} from
 *   Index in `context.events` of the first event the construct added.
 * @returns {undefined}
 */
function addResult(construct, from) {
  if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
    resolveAllConstructs.push(construct)
  }

  if (construct.resolve) {
    // Swap the tail of the events for its resolved version, in place.
    const allEvents = context.events
    const tailLength = allEvents.length - from
    const resolvedTail = construct.resolve(allEvents.slice(from), context)
    splice(allEvents, from, tailLength, resolvedTail)
  }

  if (construct.resolveTo) {
    context.events = construct.resolveTo(context.events, context)
  }

  const currentEvents = context.events
  assert(
    construct.partial ||
      currentEvents.length === 0 ||
      currentEvents[currentEvents.length - 1][0] === 'exit',
    'expected last token to end'
  )
}
/**
 * Store state: capture the point, the previous code, the current construct,
 * the number of events, and a copy of the stack of open tokens.
 *
 * @returns {Info}
 *   A `restore` function that puts all of that back, and `from`: the number
 *   of events at the time of storing.
 */
function store() {
  const savedPoint = now()
  const savedPrevious = context.previous
  const savedConstruct = context.currentConstruct
  const from = context.events.length
  const savedStack = [...stack]

  return {restore, from}

  /**
   * Restore state.
   *
   * @returns {undefined}
   */
  function restore() {
    point = savedPoint
    context.previous = savedPrevious
    context.currentConstruct = savedConstruct
    // Drop every event added since storing.
    context.events.length = from
    stack = savedStack
    accountForPotentialSkip()
    debug('position: restore: `%j`', point)
  }
}
/**
 * Move the current point a bit forward in the line when it's on a column
 * skip: if a skip is defined for the current line and the column is still
 * below 2, jump to the defined column and advance the offset accordingly.
 *
 * @returns {undefined}
 */
function accountForPotentialSkip() {
  const atLineStart = point.column < 2

  if (atLineStart && point.line in columnStart) {
    const skipTo = columnStart[point.line]
    point.column = skipTo
    point.offset += skipTo - 1
  }
}
}
/**
 * Get the chunks from a slice of chunks in the range of a token.
 * String chunks at either edge are cut at the `_bufferIndex` of the token's
 * start and end; the given list is left untouched.
 *
 * @param {Array<Chunk>} chunks
 *   All chunks.
 * @param {Pick<Token, 'end' | 'start'>} token
 *   Range to take.
 * @returns {Array<Chunk>}
 *   The chunks in that range.
 */
function sliceChunks(chunks, token) {
  const {start, end} = token

  // Both edges in one chunk: that has to be a string, take a piece of it.
  if (start._index === end._index) {
    assert(end._bufferIndex > -1, 'expected non-negative end buffer index')
    assert(start._bufferIndex > -1, 'expected non-negative start buffer index')
    // @ts-expect-error `_bufferIndex` is used on string chunks.
    return [chunks[start._index].slice(start._bufferIndex, end._bufferIndex)]
  }

  // Everything from the first chunk up to, but not including, the last one.
  const view = chunks.slice(start._index, end._index)

  if (start._bufferIndex > -1) {
    const first = view[0]

    if (typeof first === 'string') {
      view[0] = first.slice(start._bufferIndex)
    } else {
      assert(start._bufferIndex === 0, 'expected `startBufferIndex` to be `0`')
      view.shift()
    }
  }

  // The range ends inside a string chunk: add the part before the end.
  if (end._bufferIndex > 0) {
    // @ts-expect-error `_bufferIndex` is used on string chunks.
    view.push(chunks[end._index].slice(0, end._bufferIndex))
  }

  return view
}
/**
 * Get the string value of a slice of chunks.
 * String chunks are used as is; numerical codes are turned into the
 * characters they stand for.
 *
 * @param {Array<Chunk>} chunks
 *   Chunks to serialize.
 * @param {boolean | undefined} [expandTabs=false]
 *   Whether a tab and the virtual spaces after it each become a space
 *   (otherwise the tab stays a tab and those virtual spaces are dropped).
 * @returns {string}
 *   Serialized chunks.
 */
function serializeChunks(chunks, expandTabs) {
  /** @type {Array<string>} */
  const pieces = []
  /** @type {boolean | undefined} */
  let afterTab

  for (let index = 0; index < chunks.length; index++) {
    const chunk = chunks[index]
    /** @type {string} */
    let piece

    if (typeof chunk === 'string') {
      piece = chunk
    } else if (chunk === codes.carriageReturn) {
      piece = values.cr
    } else if (chunk === codes.lineFeed) {
      piece = values.lf
    } else if (chunk === codes.carriageReturnLineFeed) {
      piece = values.cr + values.lf
    } else if (chunk === codes.horizontalTab) {
      piece = expandTabs ? values.space : values.ht
    } else if (chunk === codes.virtualSpace) {
      // Virtual spaces that belong to an unexpanded tab yield nothing, and
      // leave `afterTab` as it is so the ones after them are dropped too.
      if (!expandTabs && afterTab) continue
      piece = values.space
    } else {
      assert(typeof chunk === 'number', 'expected number')
      // Currently only replacement character.
      piece = String.fromCharCode(chunk)
    }

    afterTab = chunk === codes.horizontalTab
    pieces.push(piece)
  }

  return pieces.join('')
}