site/node_modules/micromark-extension-gfm-autolink-literal/dev/lib/syntax.js
2024-10-14 08:09:33 +02:00

979 lines
22 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @typedef {import('micromark-util-types').Code} Code
* @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
* @typedef {import('micromark-util-types').Event} Event
* @typedef {import('micromark-util-types').Extension} Extension
* @typedef {import('micromark-util-types').Previous} Previous
* @typedef {import('micromark-util-types').State} State
* @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
* @typedef {import('micromark-util-types').Tokenizer} Tokenizer
*/
import {
asciiAlpha,
asciiAlphanumeric,
asciiControl,
markdownLineEndingOrSpace,
unicodePunctuation,
unicodeWhitespace
} from 'micromark-util-character'
import {codes} from 'micromark-util-symbol'
// Partial constructs: used by the tokenizers in this file through
// `effects.check` or `effects.attempt`.
const wwwPrefix = {partial: true, tokenize: tokenizeWwwPrefix}
const domain = {partial: true, tokenize: tokenizeDomain}
const path = {partial: true, tokenize: tokenizePath}
const trail = {partial: true, tokenize: tokenizeTrail}
const emailDomainDotTrail = {
  partial: true,
  tokenize: tokenizeEmailDomainDotTrail
}

// Autolink constructs: these are the ones registered in `text`.
// Each pairs a tokenizer with the predicate for the character before it.
const wwwAutolink = {previous: previousWww, tokenize: tokenizeWwwAutolink}
const protocolAutolink = {
  previous: previousProtocol,
  tokenize: tokenizeProtocolAutolink
}
const emailAutolink = {previous: previousEmail, tokenize: tokenizeEmailAutolink}

/**
 * Constructs keyed by the character code they are registered at.
 * Starts empty and is filled in by top-level statements in this file.
 *
 * @type {ConstructRecord}
 */
const text = {}
/**
 * Build the `micromark` syntax extension that enables GitHub autolink
 * literals.
 *
 * @returns {Extension}
 *   Extension for `micromark` that can be passed in `extensions`; its `text`
 *   field is the module-level construct record.
 */
export function gfmAutolinkLiteral() {
  /** @type {Extension} */
  const extension = {text}
  return extension
}
// Every ASCII alphanumeric can start an email autolink.
// Each pair is `[first, stop]`, where `stop` is the code right after the range
// (`:` follows `9`, `[` follows `Z`, and `{` follows `z`).
const alphanumericRanges = [
  [codes.digit0, codes.colon],
  [codes.uppercaseA, codes.leftSquareBracket],
  [codes.lowercaseA, codes.leftCurlyBrace]
]

for (const [first, stop] of alphanumericRanges) {
  for (let code = first; code < stop; code++) {
    text[code] = emailAutolink
  }
}

// The remaining characters accepted by `gfmAtext`.
for (const marker of [
  codes.plusSign,
  codes.dash,
  codes.dot,
  codes.underscore
]) {
  text[marker] = emailAutolink
}

// `h` and `w` can also start a protocol or `www` autolink.
// These replace the single-construct entries registered above; each key gets
// its own array.
text[codes.uppercaseH] = [emailAutolink, protocolAutolink]
text[codes.lowercaseH] = [emailAutolink, protocolAutolink]
text[codes.uppercaseW] = [emailAutolink, wwwAutolink]
text[codes.lowercaseW] = [emailAutolink, wwwAutolink]
// To do: perform email autolink literals on events, afterwards.
// That's where `markdown-rs` and `cmark-gfm` perform it.
// It should look for `@`, then for atext backwards, and then for a label
// forwards.
// To do: `mailto:`, `xmpp:` protocol as prefix.
/**
 * Tokenize an email autolink literal.
 *
 * ```markdown
 * > | a contact@example.org b
 *       ^^^^^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeEmailAutolink(effects, ok, nok) {
  const self = this
  // Set once a `.` that is not trailing was consumed in the domain.
  let sawDot = false
  // Set once an alphanumeric, `-`, or `_` was consumed in the domain.
  let sawData = false

  return start

  /**
   * Before the first character of the part in front of `@`.
   *
   * ```markdown
   * > | a contact@example.org b
   *       ^
   * ```
   *
   * @type {State}
   */
  function start(code) {
    // `previousUnbalanced` marks an event as a side effect, so it stays last
    // in this short-circuiting chain.
    const canStart =
      gfmAtext(code) &&
      previousEmail.call(self, self.previous) &&
      !previousUnbalanced(self.events)

    if (!canStart) {
      return nok(code)
    }

    effects.enter('literalAutolink')
    effects.enter('literalAutolinkEmail')
    return atext(code)
  }

  /**
   * In the atext in front of `@`.
   *
   * ```markdown
   * > | a contact@example.org b
   *       ^
   * ```
   *
   * @type {State}
   */
  function atext(code) {
    // `@` ends the atext and starts the domain.
    if (code === codes.atSign) {
      effects.consume(code)
      return emailDomain
    }

    if (gfmAtext(code)) {
      effects.consume(code)
      return atext
    }

    // Anything else before an `@` means this is not an email.
    return nok(code)
  }

  /**
   * In the domain, after `@`.
   *
   * The reference code is a bit overly complex as it handles the `@`, of which
   * there may be just one.
   * Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L318>
   *
   * ```markdown
   * > | a contact@example.org b
   *               ^
   * ```
   *
   * @type {State}
   */
  function emailDomain(code) {
    // A dot is part of the domain only when an alphanumeric (not `-` or `_`)
    // follows it; otherwise the domain ends before the dot.
    if (code === codes.dot) {
      const checkDot = effects.check(
        emailDomainDotTrail,
        emailDomainAfter,
        emailDomainDot
      )
      return checkDot(code)
    }

    const isDomainCharacter =
      code === codes.dash ||
      code === codes.underscore ||
      asciiAlphanumeric(code)

    // To do: `/` if xmpp.
    // Note: normally we'd truncate trailing punctuation from the link.
    // However, email autolink literals cannot contain any of those markers,
    // except for `.`, but that can only occur if it isn't trailing.
    // So we can ignore truncating!
    if (!isDomainCharacter) {
      return emailDomainAfter(code)
    }

    sawData = true
    effects.consume(code)
    return emailDomain
  }

  /**
   * In the domain, on a dot that is not a trail.
   *
   * ```markdown
   * > | a contact@example.org b
   *                      ^
   * ```
   *
   * @type {State}
   */
  function emailDomainDot(code) {
    effects.consume(code)
    sawDot = true
    return emailDomain
  }

  /**
   * After the domain.
   *
   * ```markdown
   * > | a contact@example.org b
   *                          ^
   * ```
   *
   * @type {State}
   */
  function emailDomainAfter(code) {
    // Domain must not be empty, must include a dot, and must end in alphabetical.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L332>.
    if (!(sawData && sawDot && asciiAlpha(self.previous))) {
      return nok(code)
    }

    effects.exit('literalAutolinkEmail')
    effects.exit('literalAutolink')
    return ok(code)
  }
}
/**
 * Tokenize a `www` autolink literal.
 *
 * ```markdown
 * > | a www.example.org b
 *       ^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeWwwAutolink(effects, ok, nok) {
  const self = this

  return wwwStart

  /**
   * At the first `w`.
   *
   * ```markdown
   * > | www.example.com/a?b#c
   *     ^
   * ```
   *
   * @type {State}
   */
  function wwwStart(code) {
    // `previousUnbalanced` marks an event as a side effect, so it stays last
    // in this short-circuiting chain.
    if (
      (code !== codes.uppercaseW && code !== codes.lowercaseW) ||
      !previousWww.call(self, self.previous) ||
      previousUnbalanced(self.events)
    ) {
      return nok(code)
    }

    effects.enter('literalAutolink')
    effects.enter('literalAutolinkWww')

    // The hooks are chained back to front: path, then domain, then prefix.
    const parsePath = effects.attempt(path, wwwAfter)
    const parseDomain = effects.attempt(domain, parsePath, nok)
    // Note: we *check*, so we can discard the `www.` we parsed.
    // If it worked, we consider it as a part of the domain.
    const checkPrefix = effects.check(wwwPrefix, parseDomain, nok)
    return checkPrefix(code)
  }

  /**
   * After the whole `www` autolink literal.
   *
   * ```markdown
   * > | www.example.com/a?b#c
   *                          ^
   * ```
   *
   * @type {State}
   */
  function wwwAfter(code) {
    effects.exit('literalAutolinkWww')
    effects.exit('literalAutolink')
    return ok(code)
  }
}
/**
 * Tokenize a protocol (`http://`, `https://`) autolink literal.
 *
 * ```markdown
 * > | a https://example.org b
 *       ^^^^^^^^^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeProtocolAutolink(effects, ok, nok) {
  const self = this
  // Letters of the scheme collected so far.
  let scheme = ''
  // Whether the first of the two slashes was consumed.
  let sawSlash = false

  return protocolStart

  /**
   * At the first letter (`h` or `H`).
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *     ^
   * ```
   *
   * @type {State}
   */
  function protocolStart(code) {
    // `previousUnbalanced` marks an event as a side effect, so it stays last
    // in this short-circuiting chain.
    if (
      (code !== codes.uppercaseH && code !== codes.lowercaseH) ||
      !previousProtocol.call(self, self.previous) ||
      previousUnbalanced(self.events)
    ) {
      return nok(code)
    }

    effects.enter('literalAutolink')
    effects.enter('literalAutolinkHttp')
    scheme += String.fromCodePoint(code)
    effects.consume(code)
    return protocolPrefixInside
  }

  /**
   * In the scheme.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *     ^^^^^
   * ```
   *
   * @type {State}
   */
  function protocolPrefixInside(code) {
    // `5` is size of `https`
    if (asciiAlpha(code) && scheme.length < 5) {
      // @ts-expect-error: definitely number.
      scheme += String.fromCodePoint(code)
      effects.consume(code)
      return protocolPrefixInside
    }

    // Only a colon may follow the scheme.
    if (code !== codes.colon) {
      return nok(code)
    }

    // The scheme is matched case-insensitively.
    const protocol = scheme.toLowerCase()

    if (protocol !== 'http' && protocol !== 'https') {
      return nok(code)
    }

    effects.consume(code)
    return protocolSlashesInside
  }

  /**
   * In the two slashes.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *           ^^
   * ```
   *
   * @type {State}
   */
  function protocolSlashesInside(code) {
    if (code !== codes.slash) {
      return nok(code)
    }

    effects.consume(code)

    // Second slash: the protocol is complete.
    if (sawSlash) {
      return afterProtocol
    }

    sawSlash = true
    return protocolSlashesInside
  }

  /**
   * After the protocol, before the domain.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *             ^
   * ```
   *
   * @type {State}
   */
  function afterProtocol(code) {
    // To do: this is different from `markdown-rs`:
    // https://github.com/wooorm/markdown-rs/blob/b3a921c761309ae00a51fe348d8a43adbc54b518/src/construct/gfm_autolink_literal.rs#L172-L182
    const cannotStartDomain =
      code === codes.eof ||
      asciiControl(code) ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code) ||
      unicodePunctuation(code)

    if (cannotStartDomain) {
      return nok(code)
    }

    return effects.attempt(
      domain,
      effects.attempt(path, protocolAfter),
      nok
    )(code)
  }

  /**
   * After the whole protocol autolink literal.
   *
   * ```markdown
   * > | https://example.com/a?b#c
   *                              ^
   * ```
   *
   * @type {State}
   */
  function protocolAfter(code) {
    effects.exit('literalAutolinkHttp')
    effects.exit('literalAutolink')
    return ok(code)
  }
}
/**
 * Tokenize the `www.` prefix: exactly three `w`s (either case) and a dot,
 * followed by at least one more character.
 *
 * ```markdown
 * > | a www.example.org b
 *       ^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeWwwPrefix(effects, ok, nok) {
  // Number of `w`s consumed so far.
  let count = 0

  return wwwPrefixInside

  /**
   * In the prefix.
   *
   * ```markdown
   * > | www.example.com
   *     ^^^^
   * ```
   *
   * @type {State}
   */
  function wwwPrefixInside(code) {
    // Still collecting `w`s: nothing else is acceptable yet.
    if (count < 3) {
      if (code !== codes.uppercaseW && code !== codes.lowercaseW) {
        return nok(code)
      }

      count++
      effects.consume(code)
      return wwwPrefixInside
    }

    // All three `w`s seen: only a dot may follow.
    if (code !== codes.dot) {
      return nok(code)
    }

    effects.consume(code)
    return wwwPrefixAfter
  }

  /**
   * After the prefix.
   *
   * ```markdown
   * > | www.example.com
   *         ^
   * ```
   *
   * @type {State}
   */
  function wwwPrefixAfter(code) {
    // If there is *anything*, we can link.
    if (code === codes.eof) {
      return nok(code)
    }

    return ok(code)
  }
}
/**
 * Tokenize the domain of a `www` or protocol autolink literal.
 *
 * ```markdown
 * > | a https://example.org b
 *               ^^^^^^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeDomain(effects, ok, nok) {
  // Whether the segment currently being read contains an `_`.
  let currentSegmentHasUnderscore = false
  // Whether the segment before the current one contained an `_`.
  let previousSegmentHasUnderscore = false
  // Whether at least one regular domain character was consumed.
  let sawCharacter = false

  return domainInside

  /**
   * In the domain.
   *
   * ```markdown
   * > | https://example.com/a
   *             ^^^^^^^^^^^
   * ```
   *
   * @type {State}
   */
  function domainInside(code) {
    // `.` and `_` are potential trailing punctuation: check whether this
    // marker, optionally followed by more trailing markers, is followed by an
    // end.
    if (code === codes.dot || code === codes.underscore) {
      const checkTrail = effects.check(trail, domainAfter, domainAtPunctuation)
      return checkTrail(code)
    }

    // GH documents that only alphanumerics (other than `-`, `.`, and `_`) can
    // occur, which sounds like ASCII only, but they also support `www.點看.com`,
    // so that's Unicode.
    // Instead of some new production for Unicode alphanumerics, markdown
    // already has that for Unicode punctuation and whitespace, so use those.
    // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L12>.
    const atEnd =
      code === codes.eof ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code) ||
      (code !== codes.dash && unicodePunctuation(code))

    if (atEnd) {
      return domainAfter(code)
    }

    sawCharacter = true
    effects.consume(code)
    return domainInside
  }

  /**
   * In the domain, at potential trailing punctuation that was not trailing.
   *
   * ```markdown
   * > | https://example.com
   *                    ^
   * ```
   *
   * @type {State}
   */
  function domainAtPunctuation(code) {
    const isUnderscore = code === codes.underscore

    // On a `.`, a new segment starts: what was the last segment becomes the
    // penultimate one.
    if (!isUnderscore) {
      previousSegmentHasUnderscore = currentSegmentHasUnderscore
    }

    // An `_` flags the current segment; a `.` starts a clean one.
    currentSegmentHasUnderscore = isUnderscore
    effects.consume(code)
    return domainInside
  }

  /**
   * After the domain.
   *
   * ```markdown
   * > | https://example.com/a
   *                        ^
   * ```
   *
   * @type {State}
   */
  function domainAfter(code) {
    // Note: GH says a dot is needed, but that's not true:
    // <https://github.com/github/cmark-gfm/issues/279>
    const valid =
      sawCharacter &&
      !currentSegmentHasUnderscore &&
      !previousSegmentHasUnderscore

    return valid ? ok(code) : nok(code)
  }
}
/**
 * Tokenize the (optional) path after the domain of an autolink literal.
 *
 * ```markdown
 * > | a https://example.org/stuff b
 *                          ^^^^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizePath(effects, ok) {
  // Number of `(` consumed so far.
  let openings = 0
  // Number of `)` consumed so far.
  let closings = 0

  return pathInside

  /**
   * In the path.
   *
   * ```markdown
   * > | https://example.com/a
   *                        ^^
   * ```
   *
   * @type {State}
   */
  function pathInside(code) {
    switch (code) {
      case codes.leftParenthesis: {
        openings++
        effects.consume(code)
        return pathInside
      }

      case codes.rightParenthesis: {
        // To do: `markdown-rs` also needs this.
        // If this is a paren, and there are less closings than openings,
        // we don't check for a trail.
        if (closings < openings) {
          return pathAtPunctuation(code)
        }

        return effects.check(trail, ok, pathAtPunctuation)(code)
      }

      // Check whether this trailing punctuation marker is optionally
      // followed by more trailing markers, and then followed
      // by an end.
      case codes.exclamationMark:
      case codes.quotationMark:
      case codes.ampersand:
      case codes.apostrophe:
      case codes.asterisk:
      case codes.comma:
      case codes.dot:
      case codes.colon:
      case codes.semicolon:
      case codes.lessThan:
      case codes.questionMark:
      case codes.rightSquareBracket:
      case codes.underscore:
      case codes.tilde: {
        return effects.check(trail, ok, pathAtPunctuation)(code)
      }

      default: {
        // End of input or whitespace ends the path.
        if (
          code === codes.eof ||
          markdownLineEndingOrSpace(code) ||
          unicodeWhitespace(code)
        ) {
          return ok(code)
        }

        effects.consume(code)
        return pathInside
      }
    }
  }

  /**
   * In the path, at potential trailing punctuation that was not trailing.
   *
   * ```markdown
   * > | https://example.com/a"b
   *                          ^
   * ```
   *
   * @type {State}
   */
  function pathAtPunctuation(code) {
    // Count closing parens.
    if (code === codes.rightParenthesis) {
      closings++
    }

    effects.consume(code)
    return pathInside
  }
}
/**
 * Tokenize a trail of punctuation.
 *
 * This calls `ok` if this *is* the trail, followed by an end, which means
 * the entire trail is not part of the link.
 * It calls `nok` if this *is* part of the link.
 *
 * ```markdown
 * > | https://example.com").
 *                        ^^^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeTrail(effects, ok, nok) {
  return trail

  /**
   * Whether `code` is one of the regular trailing punctuation markers.
   *
   * @param {Code} code
   * @returns {boolean}
   */
  function isTrailPunctuation(code) {
    switch (code) {
      case codes.exclamationMark:
      case codes.quotationMark:
      case codes.apostrophe:
      case codes.rightParenthesis:
      case codes.asterisk:
      case codes.comma:
      case codes.dot:
      case codes.colon:
      case codes.semicolon:
      case codes.questionMark:
      case codes.underscore:
      case codes.tilde: {
        return true
      }

      default: {
        return false
      }
    }
  }

  /**
   * In the trail of a domain or path.
   *
   * ```markdown
   * > | https://example.com").
   *                        ^
   * ```
   *
   * @type {State}
   */
  function trail(code) {
    // `&` followed by one or more alphabeticals and then a `;`, is
    // as a whole considered as trailing punctuation.
    // In all other cases, it is considered as continuation of the URL.
    if (code === codes.ampersand) {
      effects.consume(code)
      return trailCharRefStart
    }

    // Needed because we allow literals after `[`, as we fix:
    // <https://github.com/github/cmark-gfm/issues/278>.
    // Check that it is not followed by `(` or `[`.
    if (code === codes.rightSquareBracket) {
      effects.consume(code)
      return trailBracketAfter
    }

    // Regular trailing punctuation.
    if (isTrailPunctuation(code)) {
      effects.consume(code)
      return trail
    }

    // `<` is an end, and so is whitespace or the end of input.
    const atEnd =
      code === codes.lessThan ||
      code === codes.eof ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code)

    return atEnd ? ok(code) : nok(code)
  }

  /**
   * In the trail, after `]`.
   *
   * > 👉 **Note**: this deviates from `cmark-gfm` to fix a bug.
   * > See end of <https://github.com/github/cmark-gfm/issues/278> for more.
   *
   * ```markdown
   * > | https://example.com](
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailBracketAfter(code) {
    // Whitespace or something that could start a resource or reference is the end.
    const atEnd =
      code === codes.eof ||
      code === codes.leftParenthesis ||
      code === codes.leftSquareBracket ||
      markdownLineEndingOrSpace(code) ||
      unicodeWhitespace(code)

    if (atEnd) {
      return ok(code)
    }

    // Switch back to trail otherwise.
    return trail(code)
  }

  /**
   * In a character-reference-like trail, after `&`.
   *
   * ```markdown
   * > | https://example.com&amp;).
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailCharRefStart(code) {
    // When non-alpha, it's not a trail.
    if (!asciiAlpha(code)) {
      return nok(code)
    }

    return trailCharRefInside(code)
  }

  /**
   * In a character-reference-like trail.
   *
   * ```markdown
   * > | https://example.com&amp;).
   *                         ^
   * ```
   *
   * @type {State}
   */
  function trailCharRefInside(code) {
    if (asciiAlpha(code)) {
      effects.consume(code)
      return trailCharRefInside
    }

    // Switch back to trail if this is well-formed.
    if (code === codes.semicolon) {
      effects.consume(code)
      return trail
    }

    // It's not a trail.
    return nok(code)
  }
}
/**
 * Tokenize a dot in an email domain, to find out whether it is a trail.
 *
 * This calls `ok` if this *is* the trail, followed by an end, which means
 * the trail is not part of the link.
 * It calls `nok` if this *is* part of the link.
 *
 * ```markdown
 * > | contact@example.org.
 *                        ^
 * ```
 *
 * @this {TokenizeContext}
 * @type {Tokenizer}
 */
function tokenizeEmailDomainDotTrail(effects, ok, nok) {
  return atDot

  /**
   * At the dot.
   *
   * ```markdown
   * > | contact@example.org.
   *                    ^   ^
   * ```
   *
   * @type {State}
   */
  function atDot(code) {
    // Consumed without a check: the caller only starts this construct on a dot.
    effects.consume(code)
    return afterDot
  }

  /**
   * After the dot.
   *
   * ```markdown
   * > | contact@example.org.
   *                     ^   ^
   * ```
   *
   * @type {State}
   */
  function afterDot(code) {
    // Not a trail if alphanumeric.
    if (asciiAlphanumeric(code)) {
      return nok(code)
    }

    return ok(code)
  }
}
/**
 * Check whether a `www` autolink literal may start after `code`.
 *
 * Allowed are: the start of input, `(`, `*`, `_`, `[`, `]`, `~`, and whatever
 * `markdownLineEndingOrSpace` accepts.
 *
 * See:
 * <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L156>.
 *
 * @type {Previous}
 */
function previousWww(code) {
  switch (code) {
    case codes.eof:
    case codes.leftParenthesis:
    case codes.asterisk:
    case codes.underscore:
    case codes.leftSquareBracket:
    case codes.rightSquareBracket:
    case codes.tilde: {
      return true
    }

    default: {
      return markdownLineEndingOrSpace(code)
    }
  }
}
/**
 * Check whether a protocol autolink literal may start after `code`: anything
 * that `asciiAlpha` rejects is fine.
 *
 * See:
 * <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L214>.
 *
 * @type {Previous}
 */
function previousProtocol(code) {
  const afterLetter = asciiAlpha(code)
  return !afterLetter
}
/**
 * Check whether an email autolink literal may start after `code`.
 *
 * @this {TokenizeContext}
 * @type {Previous}
 */
function previousEmail(code) {
  // Do not allow a slash "inside" atext.
  // The reference code is a bit weird, but that's what it results in.
  // Source: <https://github.com/github/cmark-gfm/blob/ef1cfcb/extensions/autolink.c#L307>.
  // Other than slash (and atext itself), every preceding character is allowed.
  return code !== codes.slash && !gfmAtext(code)
}
/**
 * Check whether `code` may occur in the part of an email autolink literal in
 * front of `@`: `+`, `-`, `.`, `_`, or an ASCII alphanumeric.
 *
 * @param {Code} code
 *   Character code to check.
 * @returns {boolean}
 *   Whether `code` is such a character.
 */
function gfmAtext(code) {
  switch (code) {
    case codes.plusSign:
    case codes.dash:
    case codes.dot:
    case codes.underscore: {
      return true
    }

    default: {
      return asciiAlphanumeric(code)
    }
  }
}
/**
 * Check whether there is an unbalanced link or image label opening before
 * the current position.
 *
 * Walks `events` from the end. When nothing unbalanced is found, the token of
 * the last event is marked with `_gfmAutolinkLiteralWalkedInto`, so that a
 * later walk can stop as soon as it reaches that token.
 *
 * @param {Array<Event>} events
 *   Events so far; the token of the last one may be mutated.
 * @returns {boolean}
 *   Whether a `labelLink` or `labelImage` token without `_balanced` was found.
 */
function previousUnbalanced(events) {
  for (let index = events.length - 1; index >= 0; index--) {
    const token = events[index][1]
    const isLabel = token.type === 'labelLink' || token.type === 'labelImage'

    // An opening that was never balanced: no literal may start here.
    if (isLabel && !token._balanced) {
      return true
    }

    // If we've seen this token, and it was marked as not having any unbalanced
    // bracket before it, we can stop.
    if (token._gfmAutolinkLiteralWalkedInto) {
      break
    }
  }

  // Mark the last token as "walked into" w/o finding anything.
  if (events.length > 0) {
    events[events.length - 1][1]._gfmAutolinkLiteralWalkedInto = true
  }

  return false
}