/**
* While micromark is a lexer/tokenizer, the common case of going from markdown
* to html is currently built in as this module, even though the parts can be
* used separately to build ASTs, CSTs, or many other output formats.
*
* Having an HTML compiler built in is useful because it allows us to check for
* compliancy to CommonMark, the de facto norm of markdown, specified in roughly
* 600 input/output cases.
*
* This module has an interface that accepts lists of events instead of the
* whole at once, however, because markdown can’t be truly streaming, we buffer
* events before processing and outputting the final result.
*/
/**
* @typedef {import('micromark-util-types').Compile} Compile
* @typedef {import('micromark-util-types').CompileContext} CompileContext
* @typedef {import('micromark-util-types').CompileData} CompileData
* @typedef {import('micromark-util-types').CompileOptions} CompileOptions
* @typedef {import('micromark-util-types').Definition} Definition
* @typedef {import('micromark-util-types').Event} Event
* @typedef {import('micromark-util-types').Handle} Handle
* @typedef {import('micromark-util-types').HtmlExtension} HtmlExtension
* @typedef {import('micromark-util-types').NormalizedHtmlExtension} NormalizedHtmlExtension
* @typedef {import('micromark-util-types').Token} Token
*/
/**
* @typedef Media
* @property {boolean | undefined} [image]
* @property {string | undefined} [labelId]
* @property {string | undefined} [label]
* @property {string | undefined} [referenceId]
* @property {string | undefined} [destination]
* @property {string | undefined} [title]
*/
import {decodeNamedCharacterReference} from 'decode-named-character-reference'
import {push} from 'micromark-util-chunked'
import {combineHtmlExtensions} from 'micromark-util-combine-extensions'
import {decodeNumericCharacterReference} from 'micromark-util-decode-numeric-character-reference'
import {encode as _encode} from 'micromark-util-encode'
import {normalizeIdentifier} from 'micromark-util-normalize-identifier'
import {sanitizeUri} from 'micromark-util-sanitize-uri'
const hasOwnProperty = {}.hasOwnProperty
/**
* These two are allowlists of safe protocols for full URLs in respectively the
* `href` (on ``) and `src` (on ``) attributes.
* They are based on what is allowed on GitHub,
*
*/
const protocolHref = /^(https?|ircs?|mailto|xmpp)$/i
const protocolSrc = /^https?$/i
/**
* @param {CompileOptions | null | undefined} [options]
* @returns {Compile}
*/
export function compile(options) {
const settings = options || {}
/**
* Tags is needed because according to markdown, links and emphasis and
* whatnot can exist in images, however, as HTML doesn’t allow content in
* images, the tags are ignored in the `alt` attribute, but the content
* remains.
*
* @type {boolean | undefined}
*/
let tags = true
/**
* An object to track identifiers to media (URLs and titles) defined with
* definitions.
*
* @type {Record}
*/
const definitions = {}
/**
* A lot of the handlers need to capture some of the output data, modify it
* somehow, and then deal with it.
* We do that by tracking a stack of buffers, that can be opened (with
* `buffer`) and closed (with `resume`) to access them.
*
* @type {Array>}
*/
const buffers = [[]]
/**
* As we can have links in images and the other way around, where the deepest
* ones are closed first, we need to track which one we’re in.
*
* @type {Array}
*/
const mediaStack = []
/**
* Same as `mediaStack` for tightness, which is specific to lists.
* We need to track if we’re currently in a tight or loose container.
*
* @type {Array}
*/
const tightStack = []
/** @type {HtmlExtension} */
const defaultHandlers = {
enter: {
blockQuote: onenterblockquote,
codeFenced: onentercodefenced,
codeFencedFenceInfo: buffer,
codeFencedFenceMeta: buffer,
codeIndented: onentercodeindented,
codeText: onentercodetext,
content: onentercontent,
definition: onenterdefinition,
definitionDestinationString: onenterdefinitiondestinationstring,
definitionLabelString: buffer,
definitionTitleString: buffer,
emphasis: onenteremphasis,
htmlFlow: onenterhtmlflow,
htmlText: onenterhtml,
image: onenterimage,
label: buffer,
link: onenterlink,
listItemMarker: onenterlistitemmarker,
listItemValue: onenterlistitemvalue,
listOrdered: onenterlistordered,
listUnordered: onenterlistunordered,
paragraph: onenterparagraph,
reference: buffer,
resource: onenterresource,
resourceDestinationString: onenterresourcedestinationstring,
resourceTitleString: buffer,
setextHeading: onentersetextheading,
strong: onenterstrong
},
exit: {
atxHeading: onexitatxheading,
atxHeadingSequence: onexitatxheadingsequence,
autolinkEmail: onexitautolinkemail,
autolinkProtocol: onexitautolinkprotocol,
blockQuote: onexitblockquote,
characterEscapeValue: onexitdata,
characterReferenceMarkerHexadecimal: onexitcharacterreferencemarker,
characterReferenceMarkerNumeric: onexitcharacterreferencemarker,
characterReferenceValue: onexitcharacterreferencevalue,
codeFenced: onexitflowcode,
codeFencedFence: onexitcodefencedfence,
codeFencedFenceInfo: onexitcodefencedfenceinfo,
codeFencedFenceMeta: onresumedrop,
codeFlowValue: onexitcodeflowvalue,
codeIndented: onexitflowcode,
codeText: onexitcodetext,
codeTextData: onexitdata,
data: onexitdata,
definition: onexitdefinition,
definitionDestinationString: onexitdefinitiondestinationstring,
definitionLabelString: onexitdefinitionlabelstring,
definitionTitleString: onexitdefinitiontitlestring,
emphasis: onexitemphasis,
hardBreakEscape: onexithardbreak,
hardBreakTrailing: onexithardbreak,
htmlFlow: onexithtml,
htmlFlowData: onexitdata,
htmlText: onexithtml,
htmlTextData: onexitdata,
image: onexitmedia,
label: onexitlabel,
labelText: onexitlabeltext,
lineEnding: onexitlineending,
link: onexitmedia,
listOrdered: onexitlistordered,
listUnordered: onexitlistunordered,
paragraph: onexitparagraph,
reference: onresumedrop,
referenceString: onexitreferencestring,
resource: onresumedrop,
resourceDestinationString: onexitresourcedestinationstring,
resourceTitleString: onexitresourcetitlestring,
setextHeading: onexitsetextheading,
setextHeadingLineSequence: onexitsetextheadinglinesequence,
setextHeadingText: onexitsetextheadingtext,
strong: onexitstrong,
thematicBreak: onexitthematicbreak
}
}
/**
* Combine the HTML extensions with the default handlers.
* An HTML extension is an object whose fields are either `enter` or `exit`
* (reflecting whether a token is entered or exited).
* The values at such objects are names of tokens mapping to handlers.
* Handlers are called, respectively when a token is opener or closed, with
* that token, and a context as `this`.
*/
const handlers =
/** @type {NormalizedHtmlExtension} */
combineHtmlExtensions(
[defaultHandlers].concat(settings.htmlExtensions || [])
)
/**
* Handlers do often need to keep track of some state.
* That state is provided here as a key-value store (an object).
*
* @type {CompileData}
*/
const data = {
tightStack,
definitions
}
/**
* The context for handlers references a couple of useful functions.
* In handlers from extensions, those can be accessed at `this`.
* For the handlers here, they can be accessed directly.
*
* @type {Omit}
*/
const context = {
lineEndingIfNeeded,
options: settings,
encode,
raw,
tag,
buffer,
resume,
setData,
getData
}
/**
* Generally, micromark copies line endings (`'\r'`, `'\n'`, `'\r\n'`) in the
* markdown document over to the compiled HTML.
* In some cases, such as `> a`, CommonMark requires that extra line endings
* are added: `\na
\n
`.
* This variable hold the default line ending when given (or `undefined`),
* and in the latter case will be updated to the first found line ending if
* there is one.
*/
let lineEndingStyle = settings.defaultLineEnding
// Return the function that handles a slice of events.
return compile
/**
* Deal w/ a slice of events.
* Return either the empty string if there’s nothing of note to return, or the
* result when done.
*
* @param {Array} events
* @returns {string}
*/
function compile(events) {
let index = -1
let start = 0
/** @type {Array} */
const listStack = []
// As definitions can come after references, we need to figure out the media
// (urls and titles) defined by them before handling the references.
// So, we do sort of what HTML does: put metadata at the start (in head), and
// then put content after (`body`).
/** @type {Array} */
let head = []
/** @type {Array} */
let body = []
while (++index < events.length) {
// Figure out the line ending style used in the document.
if (
!lineEndingStyle &&
(events[index][1].type === 'lineEnding' ||
events[index][1].type === 'lineEndingBlank')
) {
// @ts-expect-error Hush, it’s a line ending.
lineEndingStyle = events[index][2].sliceSerialize(events[index][1])
}
// Preprocess lists to infer whether the list is loose or not.
if (
events[index][1].type === 'listOrdered' ||
events[index][1].type === 'listUnordered'
) {
if (events[index][0] === 'enter') {
listStack.push(index)
} else {
prepareList(events.slice(listStack.pop(), index))
}
}
// Move definitions to the front.
if (events[index][1].type === 'definition') {
if (events[index][0] === 'enter') {
body = push(body, events.slice(start, index))
start = index
} else {
head = push(head, events.slice(start, index + 1))
start = index + 1
}
}
}
head = push(head, body)
head = push(head, events.slice(start))
index = -1
const result = head
// Handle the start of the document, if defined.
if (handlers.enter.null) {
handlers.enter.null.call(context)
}
// Handle all events.
while (++index < events.length) {
const handles = handlers[result[index][0]]
const kind = result[index][1].type
const handle = handles[kind]
if (hasOwnProperty.call(handles, kind) && handle) {
handle.call(
Object.assign(
{
sliceSerialize: result[index][2].sliceSerialize
},
context
),
result[index][1]
)
}
}
// Handle the end of the document, if defined.
if (handlers.exit.null) {
handlers.exit.null.call(context)
}
return buffers[0].join('')
}
/**
* Figure out whether lists are loose or not.
*
* @param {Array} slice
* @returns {undefined}
*/
function prepareList(slice) {
const length = slice.length
let index = 0 // Skip open.
let containerBalance = 0
let loose = false
/** @type {boolean | undefined} */
let atMarker
while (++index < length) {
const event = slice[index]
if (event[1]._container) {
atMarker = undefined
if (event[0] === 'enter') {
containerBalance++
} else {
containerBalance--
}
} else
switch (event[1].type) {
case 'listItemPrefix': {
if (event[0] === 'exit') {
atMarker = true
}
break
}
case 'linePrefix': {
// Ignore
break
}
case 'lineEndingBlank': {
if (event[0] === 'enter' && !containerBalance) {
if (atMarker) {
atMarker = undefined
} else {
loose = true
}
}
break
}
default: {
atMarker = undefined
}
}
}
slice[0][1]._loose = loose
}
/**
* @type {CompileContext['setData']}
*/
function setData(key, value) {
// @ts-expect-error: assume `value` is omitted (`undefined` is passed) only
// if allowed.
data[key] = value
}
/**
* @type {CompileContext['getData']}
*/
function getData(key) {
return data[key]
}
/** @type {CompileContext['buffer']} */
function buffer() {
buffers.push([])
}
/** @type {CompileContext['resume']} */
function resume() {
const buf = buffers.pop()
return buf.join('')
}
/** @type {CompileContext['tag']} */
function tag(value) {
if (!tags) return
setData('lastWasTag', true)
buffers[buffers.length - 1].push(value)
}
/** @type {CompileContext['raw']} */
function raw(value) {
setData('lastWasTag')
buffers[buffers.length - 1].push(value)
}
/**
* Output an extra line ending.
*
* @returns {undefined}
*/
function lineEnding() {
raw(lineEndingStyle || '\n')
}
/** @type {CompileContext['lineEndingIfNeeded']} */
function lineEndingIfNeeded() {
const buffer = buffers[buffers.length - 1]
const slice = buffer[buffer.length - 1]
const previous = slice ? slice.charCodeAt(slice.length - 1) : null
if (previous === 10 || previous === 13 || previous === null) {
return
}
lineEnding()
}
/** @type {CompileContext['encode']} */
function encode(value) {
return getData('ignoreEncode') ? value : _encode(value)
}
//
// Handlers.
//
/**
* @returns {undefined}
*/
function onresumedrop() {
resume()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterlistordered(token) {
tightStack.push(!token._loose)
lineEndingIfNeeded()
tag('')
} else {
onexitlistitem()
}
lineEndingIfNeeded()
tag('- ')
setData('expectFirstItem')
// “Hack” to prevent a line ending from showing up if the item is empty.
setData('lastWasTag')
}
/**
* @returns {undefined}
*/
function onexitlistordered() {
onexitlistitem()
tightStack.pop()
lineEnding()
tag('
')
}
/**
* @returns {undefined}
*/
function onexitlistunordered() {
onexitlistitem()
tightStack.pop()
lineEnding()
tag('')
}
/**
* @returns {undefined}
*/
function onexitlistitem() {
if (getData('lastWasTag') && !getData('slurpAllLineEndings')) {
lineEndingIfNeeded()
}
tag('')
setData('slurpAllLineEndings')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterblockquote() {
tightStack.push(false)
lineEndingIfNeeded()
tag('')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitblockquote() {
tightStack.pop()
lineEndingIfNeeded()
tag('
')
setData('slurpAllLineEndings')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterparagraph() {
if (!tightStack[tightStack.length - 1]) {
lineEndingIfNeeded()
tag('')
}
setData('slurpAllLineEndings')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitparagraph() {
if (tightStack[tightStack.length - 1]) {
setData('slurpAllLineEndings', true)
} else {
tag('
')
}
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onentercodefenced() {
lineEndingIfNeeded()
tag('')
setData('slurpOneLineEnding', true)
}
setData('fencesCount', count + 1)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onentercodeindented() {
lineEndingIfNeeded()
tag('')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitflowcode() {
const count = getData('fencesCount')
// One special case is if we are inside a container, and the fenced code was
// not closed (meaning it runs to the end).
// In that case, the following line ending, is considered *outside* the
// fenced code and block quote by micromark, but CM wants to treat that
// ending as part of the code.
if (
count !== undefined &&
count < 2 &&
data.tightStack.length > 0 &&
!getData('lastWasTag')
) {
lineEnding()
}
// But in most cases, it’s simpler: when we’ve seen some data, emit an extra
// line ending when needed.
if (getData('flowCodeSeenData')) {
lineEndingIfNeeded()
}
tag('
')
if (count !== undefined && count < 2) lineEndingIfNeeded()
setData('flowCodeSeenData')
setData('fencesCount')
setData('slurpOneLineEnding')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterimage() {
mediaStack.push({
image: true
})
tags = undefined // Disallow tags.
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterlink() {
mediaStack.push({})
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitlabeltext(token) {
mediaStack[mediaStack.length - 1].labelId = this.sliceSerialize(token)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitlabel() {
mediaStack[mediaStack.length - 1].label = resume()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitreferencestring(token) {
mediaStack[mediaStack.length - 1].referenceId = this.sliceSerialize(token)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterresource() {
buffer() // We can have line endings in the resource, ignore them.
mediaStack[mediaStack.length - 1].destination = ''
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterresourcedestinationstring() {
buffer()
// Ignore encoding the result, as we’ll first percent encode the url and
// encode manually after.
setData('ignoreEncode', true)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitresourcedestinationstring() {
mediaStack[mediaStack.length - 1].destination = resume()
setData('ignoreEncode')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitresourcetitlestring() {
mediaStack[mediaStack.length - 1].title = resume()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitmedia() {
let index = mediaStack.length - 1 // Skip current.
const media = mediaStack[index]
const id = media.referenceId || media.labelId
const context =
media.destination === undefined
? definitions[normalizeIdentifier(id)]
: media
tags = true
while (index--) {
if (mediaStack[index].image) {
tags = undefined
break
}
}
if (media.image) {
tag(
'')
} else {
tag('>')
raw(media.label)
tag('
')
}
mediaStack.pop()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterdefinition() {
buffer()
mediaStack.push({})
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitdefinitionlabelstring(token) {
// Discard label, use the source content instead.
resume()
mediaStack[mediaStack.length - 1].labelId = this.sliceSerialize(token)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onenterdefinitiondestinationstring() {
buffer()
setData('ignoreEncode', true)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitdefinitiondestinationstring() {
mediaStack[mediaStack.length - 1].destination = resume()
setData('ignoreEncode')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitdefinitiontitlestring() {
mediaStack[mediaStack.length - 1].title = resume()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitdefinition() {
const media = mediaStack[mediaStack.length - 1]
const id = normalizeIdentifier(media.labelId)
resume()
if (!hasOwnProperty.call(definitions, id)) {
definitions[id] = mediaStack[mediaStack.length - 1]
}
mediaStack.pop()
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onentercontent() {
setData('slurpAllLineEndings', true)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitatxheadingsequence(token) {
// Exit for further sequences.
if (getData('headingRank')) return
setData('headingRank', this.sliceSerialize(token).length)
lineEndingIfNeeded()
tag('')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onentersetextheading() {
buffer()
setData('slurpAllLineEndings')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitsetextheadingtext() {
setData('slurpAllLineEndings', true)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitatxheading() {
tag('')
setData('headingRank')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitsetextheadinglinesequence(token) {
setData(
'headingRank',
this.sliceSerialize(token).charCodeAt(0) === 61 ? 1 : 2
)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitsetextheading() {
const value = resume()
lineEndingIfNeeded()
tag('')
raw(value)
tag('')
setData('slurpAllLineEndings')
setData('headingRank')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitdata(token) {
raw(encode(this.sliceSerialize(token)))
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitlineending(token) {
if (getData('slurpAllLineEndings')) {
return
}
if (getData('slurpOneLineEnding')) {
setData('slurpOneLineEnding')
return
}
if (getData('inCodeText')) {
raw(' ')
return
}
raw(encode(this.sliceSerialize(token)))
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitcodeflowvalue(token) {
raw(encode(this.sliceSerialize(token)))
setData('flowCodeSeenData', true)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexithardbreak() {
tag('
')
}
/**
* @returns {undefined}
*/
function onenterhtmlflow() {
lineEndingIfNeeded()
onenterhtml()
}
/**
* @returns {undefined}
*/
function onexithtml() {
setData('ignoreEncode')
}
/**
* @returns {undefined}
*/
function onenterhtml() {
if (settings.allowDangerousHtml) {
setData('ignoreEncode', true)
}
}
/**
* @returns {undefined}
*/
function onenteremphasis() {
tag('')
}
/**
* @returns {undefined}
*/
function onenterstrong() {
tag('')
}
/**
* @returns {undefined}
*/
function onentercodetext() {
setData('inCodeText', true)
tag('')
}
/**
* @returns {undefined}
*/
function onexitcodetext() {
setData('inCodeText')
tag('
')
}
/**
* @returns {undefined}
*/
function onexitemphasis() {
tag('')
}
/**
* @returns {undefined}
*/
function onexitstrong() {
tag('')
}
/**
* @returns {undefined}
*/
function onexitthematicbreak() {
lineEndingIfNeeded()
tag('
')
}
/**
* @this {CompileContext}
* @param {Token} token
* @returns {undefined}
*/
function onexitcharacterreferencemarker(token) {
setData('characterReferenceType', token.type)
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitcharacterreferencevalue(token) {
let value = this.sliceSerialize(token)
// @ts-expect-error `decodeNamedCharacterReference` can return false for
// invalid named character references, but everything we’ve tokenized is
// valid.
value = getData('characterReferenceType')
? decodeNumericCharacterReference(
value,
getData('characterReferenceType') ===
'characterReferenceMarkerNumeric'
? 10
: 16
)
: decodeNamedCharacterReference(value)
raw(encode(value))
setData('characterReferenceType')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitautolinkprotocol(token) {
const uri = this.sliceSerialize(token)
tag(
''
)
raw(encode(uri))
tag('')
}
/**
* @this {CompileContext}
* @type {Handle}
*/
function onexitautolinkemail(token) {
const uri = this.sliceSerialize(token)
tag('')
raw(encode(uri))
tag('')
}
}