site/node_modules/parse-latin/lib/index.js

353 lines
9.2 KiB
JavaScript
Raw Permalink Normal View History

2024-10-14 06:09:33 +00:00
/**
* @typedef {import('nlcst').Nodes} Nodes
* @typedef {import('nlcst').Parents} Parents
* @typedef {import('nlcst').Paragraph} Paragraph
* @typedef {import('nlcst').Root} Root
* @typedef {import('nlcst').RootContent} RootContent
* @typedef {import('nlcst').Sentence} Sentence
* @typedef {import('nlcst').SentenceContent} SentenceContent
* @typedef {import('vfile').VFile} VFile
*/
/**
* @template {Nodes} Node
* Node type.
* @callback Plugin
* Transform a node.
* @param {Node} node
* The node.
* @returns {undefined | void}
* Nothing.
*/
import {toString} from 'nlcst-to-string'
import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'
import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'
import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'
import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'
import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'
import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'
import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
import {mergeInitialisms} from './plugin/merge-initialisms.js'
import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'
import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'
import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'
import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
import {patchPosition} from './plugin/patch-position.js'
import {
newLine,
punctuation,
surrogates,
terminalMarker,
whiteSpace,
word
} from './expressions.js'
// PARSE LATIN
/**
* Create a new parser.
*/
export class ParseLatin {
/**
* Create a new parser.
*
* This additionally supports `retext`-like call: where an instance is
* created for each file, and the file is given on construction.
*
* @param {string | null | undefined} [doc]
* Value to parse (optional).
* @param {VFile | null | undefined} [file]
* Corresponding file (optional).
*/
constructor(doc, file) {
const value = file || doc
/** @type {string | undefined} */
this.doc = value ? String(value) : undefined
/** @type {Array<Plugin<Root>>} */
this.tokenizeRootPlugins = [...this.tokenizeRootPlugins]
/** @type {Array<Plugin<Paragraph>>} */
this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins]
/** @type {Array<Plugin<Sentence>>} */
this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins]
}
/**
* Turn natural language into a syntax tree.
*
* @param {string | null | undefined} [value]
* Value to parse (optional).
* @returns {Root}
* Tree.
*/
parse(value) {
return this.tokenizeRoot(value || this.doc)
}
/**
* Parse as a root.
*
* @param {string | null | undefined} [value]
* Value to parse (optional).
* @returns {Root}
* Built tree.
*/
tokenizeRoot(value) {
const paragraph = this.tokenizeParagraph(value)
/** @type {Root} */
const result = {
type: 'RootNode',
children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
}
let index = -1
while (this.tokenizeRootPlugins[++index]) {
this.tokenizeRootPlugins[index](result)
}
return result
}
/**
* Parse as a paragraph.
*
* @param {string | null | undefined} [value]
* Value to parse (optional).
* @returns {Paragraph}
* Built tree.
*/
tokenizeParagraph(value) {
const sentence = this.tokenizeSentence(value)
/** @type {Paragraph} */
const result = {
type: 'ParagraphNode',
children: splitNode(sentence, 'PunctuationNode', terminalMarker)
}
let index = -1
while (this.tokenizeParagraphPlugins[++index]) {
this.tokenizeParagraphPlugins[index](result)
}
return result
}
/**
* Parse as a sentence.
*
* @param {string | null | undefined} [value]
* Value to parse (optional).
* @returns {Sentence}
* Built tree.
*/
tokenizeSentence(value) {
const children = this.tokenize(value)
/** @type {Sentence} */
const result = {type: 'SentenceNode', children}
let index = -1
while (this.tokenizeSentencePlugins[++index]) {
this.tokenizeSentencePlugins[index](result)
}
return result
}
/**
* Transform a `value` into a list of nlcsts.
*
* @param {string | null | undefined} [value]
* Value to parse (optional).
* @returns {Array<SentenceContent>}
* Built sentence content.
*/
tokenize(value) {
/** @type {Array<SentenceContent>} */
const children = []
if (!value) {
return children
}
const currentPoint = {line: 1, column: 1, offset: 0}
let from = 0
let index = 0
let start = {...currentPoint}
/** @type {SentenceContent['type'] | undefined} */
let previousType
/** @type {string | undefined} */
let previous
while (index < value.length) {
const current = value.charAt(index)
const currentType = whiteSpace.test(current)
? 'WhiteSpaceNode'
: punctuation.test(current)
? 'PunctuationNode'
: word.test(current)
? 'WordNode'
: 'SymbolNode'
if (
from < index &&
previousType &&
currentType &&
!(
previousType === currentType &&
// Words or white space continue.
(previousType === 'WordNode' ||
previousType === 'WhiteSpaceNode' ||
// Same character of punctuation or symbol also continues.
current === previous ||
// Surrogates of punctuation or symbol also continue.
surrogates.test(current))
)
) {
// Flush the previous queue.
children.push(createNode(previousType, value.slice(from, index)))
from = index
start = {...currentPoint}
}
if (current === '\r' || (current === '\n' && previous !== '\r')) {
currentPoint.line++
currentPoint.column = 1
} else if (current !== '\n') {
currentPoint.column++
}
currentPoint.offset++
previousType = currentType
previous = current
index++
}
if (previousType && from < index) {
children.push(createNode(previousType, value.slice(from, index)))
}
return children
/**
* @param {SentenceContent['type']} type
* Node type to build.
* @param {string} value
* Value.
* @returns {SentenceContent}
* Node.
*/
function createNode(type, value) {
return type === 'WordNode'
? {
type: 'WordNode',
children: [
{
type: 'TextNode',
value,
position: {start, end: {...currentPoint}}
}
],
position: {start, end: {...currentPoint}}
}
: {type, value, position: {start, end: {...currentPoint}}}
}
}
}
/**
* List of transforms handling a sentence.
*/
ParseLatin.prototype.tokenizeSentencePlugins = [
mergeInitialWordSymbol,
mergeFinalWordSymbol,
mergeInnerWordSymbol,
mergeInnerWordSlash,
mergeInitialisms,
patchPosition
]
/**
* List of transforms handling a paragraph.
*/
ParseLatin.prototype.tokenizeParagraphPlugins = [
mergeNonWordSentences,
mergeAffixSymbol,
mergeInitialLowerCaseLetterSentences,
mergeInitialDigitSentences,
mergePrefixExceptions,
mergeAffixExceptions,
mergeRemainingFullStops,
makeInitialWhiteSpaceSiblings,
makeFinalWhiteSpaceSiblings,
breakImplicitSentences,
removeEmptyNodes,
patchPosition
]
/**
* List of transforms handling a root.
*/
ParseLatin.prototype.tokenizeRootPlugins = [
makeInitialWhiteSpaceSiblings,
makeFinalWhiteSpaceSiblings,
removeEmptyNodes,
patchPosition
]
/**
* A function that splits one node into several nodes.
*
* @template {Parents} Node
* Node type.
* @param {Node} node
* Node to split.
* @param {RegExp} expression
* Split on this regex.
* @param {Node['children'][number]['type']} childType
* Split this node type.
* @returns {Array<Node>}
* The given node, split into several nodes.
*/
function splitNode(node, childType, expression) {
/** @type {Array<Node>} */
const result = []
let index = -1
let start = 0
while (++index < node.children.length) {
const token = node.children[index]
if (
index === node.children.length - 1 ||
(token.type === childType && expression.test(toString(token)))
) {
/** @type {Node} */
// @ts-expect-error: fine
const parent = {
type: node.type,
children: node.children.slice(start, index + 1)
}
const first = node.children[start]
const last = token
if (first.position && last.position) {
parent.position = {
start: first.position.start,
end: last.position.end
}
}
result.push(parent)
start = index + 1
}
}
return result
}