site/node_modules/parse-latin/lib/index.js

/**
 * @typedef {import('nlcst').Nodes} Nodes
 * @typedef {import('nlcst').Parents} Parents
 * @typedef {import('nlcst').Paragraph} Paragraph
 * @typedef {import('nlcst').Root} Root
 * @typedef {import('nlcst').RootContent} RootContent
 * @typedef {import('nlcst').Sentence} Sentence
 * @typedef {import('nlcst').SentenceContent} SentenceContent
 * @typedef {import('vfile').VFile} VFile
 */

/**
 * @template {Nodes} Node
 *   Node type.
 * @callback Plugin
 *   Transform a node.
 * @param {Node} node
 *   The node.
 * @returns {undefined | void}
 *   Nothing.
 */

import {toString} from 'nlcst-to-string'
import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'
import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'
import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'
import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'
import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'
import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'
import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'
import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'
import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'
import {mergeInitialisms} from './plugin/merge-initialisms.js'
import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'
import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'
import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'
import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'
import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'
import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'
import {patchPosition} from './plugin/patch-position.js'
import {
  newLine,
  punctuation,
  surrogates,
  terminalMarker,
  whiteSpace,
  word
} from './expressions.js'

// PARSE LATIN

/**
 * Create a new parser.
 */
export class ParseLatin {
  /**
   * Create a new parser.
   *
   * This additionally supports `retext`-like call: where an instance is
   * created for each file, and the file is given on construction.
   *
   * @param {string | null | undefined} [doc]
   *   Value to parse (optional).
   * @param {VFile | null | undefined} [file]
   *   Corresponding file (optional).
   */
  constructor(doc, file) {
    const value = file || doc

    /** @type {string | undefined} */
    this.doc = value ? String(value) : undefined

    /** @type {Array<Plugin<Root>>} */
    this.tokenizeRootPlugins = [...this.tokenizeRootPlugins]
    /** @type {Array<Plugin<Paragraph>>} */
    this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins]
    /** @type {Array<Plugin<Sentence>>} */
    this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins]
  }

  /**
   * Turn natural language into a syntax tree.
   *
   * @param {string | null | undefined} [value]
   *   Value to parse (optional).
   * @returns {Root}
   *   Tree.
   */
  parse(value) {
    return this.tokenizeRoot(value || this.doc)
  }

  /**
   * Parse as a root.
   *
   * @param {string | null | undefined} [value]
   *   Value to parse (optional).
   * @returns {Root}
   *   Built tree.
   */
  tokenizeRoot(value) {
    const paragraph = this.tokenizeParagraph(value)
    /** @type {Root} */
    const result = {
      type: 'RootNode',
      children: splitNode(paragraph, 'WhiteSpaceNode', newLine)
    }

    let index = -1
    while (this.tokenizeRootPlugins[++index]) {
      this.tokenizeRootPlugins[index](result)
    }

    return result
  }

  /**
   * Parse as a paragraph.
   *
   * @param {string | null | undefined} [value]
   *   Value to parse (optional).
   * @returns {Paragraph}
   *   Built tree.
   */
  tokenizeParagraph(value) {
    const sentence = this.tokenizeSentence(value)
    /** @type {Paragraph} */
    const result = {
      type: 'ParagraphNode',
      children: splitNode(sentence, 'PunctuationNode', terminalMarker)
    }

    let index = -1
    while (this.tokenizeParagraphPlugins[++index]) {
      this.tokenizeParagraphPlugins[index](result)
    }

    return result
  }

  /**
   * Parse as a sentence.
   *
   * @param {string | null | undefined} [value]
   *   Value to parse (optional).
   * @returns {Sentence}
   *   Built tree.
   */
  tokenizeSentence(value) {
    const children = this.tokenize(value)
    /** @type {Sentence} */
    const result = {type: 'SentenceNode', children}

    let index = -1
    while (this.tokenizeSentencePlugins[++index]) {
      this.tokenizeSentencePlugins[index](result)
    }

    return result
  }

  /**
   *  Transform a `value` into a list of nlcsts.
   *
   * @param {string | null | undefined} [value]
   *   Value to parse (optional).
   * @returns {Array<SentenceContent>}
   *   Built sentence content.
   */
  tokenize(value) {
    /** @type {Array<SentenceContent>} */
    const children = []

    if (!value) {
      return children
    }

    const currentPoint = {line: 1, column: 1, offset: 0}
    let from = 0
    let index = 0
    let start = {...currentPoint}
    /** @type {SentenceContent['type'] | undefined} */
    let previousType
    /** @type {string | undefined} */
    let previous

    while (index < value.length) {
      const current = value.charAt(index)
      const currentType = whiteSpace.test(current)
        ? 'WhiteSpaceNode'
        : punctuation.test(current)
        ? 'PunctuationNode'
        : word.test(current)
        ? 'WordNode'
        : 'SymbolNode'

      if (
        from < index &&
        previousType &&
        currentType &&
        !(
          previousType === currentType &&
          // Words or white space continue.
          (previousType === 'WordNode' ||
            previousType === 'WhiteSpaceNode' ||
            // Same character of punctuation or symbol also continues.
            current === previous ||
            // Surrogates of  punctuation or symbol also continue.
            surrogates.test(current))
        )
      ) {
        // Flush the previous queue.
        children.push(createNode(previousType, value.slice(from, index)))
        from = index
        start = {...currentPoint}
      }

      if (current === '\r' || (current === '\n' && previous !== '\r')) {
        currentPoint.line++
        currentPoint.column = 1
      } else if (current !== '\n') {
        currentPoint.column++
      }

      currentPoint.offset++
      previousType = currentType
      previous = current
      index++
    }

    if (previousType && from < index) {
      children.push(createNode(previousType, value.slice(from, index)))
    }

    return children

    /**
     * @param {SentenceContent['type']} type
     *   Node type to build.
     * @param {string} value
     *   Value.
     * @returns {SentenceContent}
     *   Node.
     */
    function createNode(type, value) {
      return type === 'WordNode'
        ? {
            type: 'WordNode',
            children: [
              {
                type: 'TextNode',
                value,
                position: {start, end: {...currentPoint}}
              }
            ],
            position: {start, end: {...currentPoint}}
          }
        : {type, value, position: {start, end: {...currentPoint}}}
    }
  }
}

/**
 * List of transforms handling a sentence.
 */
ParseLatin.prototype.tokenizeSentencePlugins = [
  mergeInitialWordSymbol,
  mergeFinalWordSymbol,
  mergeInnerWordSymbol,
  mergeInnerWordSlash,
  mergeInitialisms,
  patchPosition
]

/**
 * List of transforms handling a paragraph.
 */
ParseLatin.prototype.tokenizeParagraphPlugins = [
  mergeNonWordSentences,
  mergeAffixSymbol,
  mergeInitialLowerCaseLetterSentences,
  mergeInitialDigitSentences,
  mergePrefixExceptions,
  mergeAffixExceptions,
  mergeRemainingFullStops,
  makeInitialWhiteSpaceSiblings,
  makeFinalWhiteSpaceSiblings,
  breakImplicitSentences,
  removeEmptyNodes,
  patchPosition
]

/**
 * List of transforms handling a root.
 */
ParseLatin.prototype.tokenizeRootPlugins = [
  makeInitialWhiteSpaceSiblings,
  makeFinalWhiteSpaceSiblings,
  removeEmptyNodes,
  patchPosition
]

/**
 * A function that splits one node into several nodes.
 *
 * @template {Parents} Node
 *   Node type.
 * @param {Node} node
 *   Node to split.
 * @param {RegExp} expression
 *   Split on this regex.
 * @param {Node['children'][number]['type']} childType
 *   Split this node type.
 * @returns {Array<Node>}
 *   The given node, split into several nodes.
 */
function splitNode(node, childType, expression) {
  /** @type {Array<Node>} */
  const result = []
  let index = -1
  let start = 0

  while (++index < node.children.length) {
    const token = node.children[index]

    if (
      index === node.children.length - 1 ||
      (token.type === childType && expression.test(toString(token)))
    ) {
      /** @type {Node} */
      // @ts-expect-error: fine
      const parent = {
        type: node.type,
        children: node.children.slice(start, index + 1)
      }

      const first = node.children[start]
      const last = token
      if (first.position && last.position) {
        parent.position = {
          start: first.position.start,
          end: last.position.end
        }
      }

      result.push(parent)
      start = index + 1
    }
  }

  return result
}
updated shit 2024-10-14 06:09:33 +00:00			`/**`
			`* @typedef {import('nlcst').Nodes} Nodes`
			`* @typedef {import('nlcst').Parents} Parents`
			`* @typedef {import('nlcst').Paragraph} Paragraph`
			`* @typedef {import('nlcst').Root} Root`
			`* @typedef {import('nlcst').RootContent} RootContent`
			`* @typedef {import('nlcst').Sentence} Sentence`
			`* @typedef {import('nlcst').SentenceContent} SentenceContent`
			`* @typedef {import('vfile').VFile} VFile`
			`*/`

			`/**`
			`* @template {Nodes} Node`
			`* Node type.`
			`* @callback Plugin`
			`* Transform a node.`
			`* @param {Node} node`
			`* The node.`
			`* @returns {undefined \| void}`
			`* Nothing.`
			`*/`

			`import {toString} from 'nlcst-to-string'`
			`import {mergeAffixExceptions} from './plugin/merge-affix-exceptions.js'`
			`import {mergeAffixSymbol} from './plugin/merge-affix-symbol.js'`
			`import {breakImplicitSentences} from './plugin/break-implicit-sentences.js'`
			`import {makeFinalWhiteSpaceSiblings} from './plugin/make-final-white-space-siblings.js'`
			`import {makeInitialWhiteSpaceSiblings} from './plugin/make-initial-white-space-siblings.js'`
			`import {mergeFinalWordSymbol} from './plugin/merge-final-word-symbol.js'`
			`import {mergeInitialDigitSentences} from './plugin/merge-initial-digit-sentences.js'`
			`import {mergeInitialLowerCaseLetterSentences} from './plugin/merge-initial-lower-case-letter-sentences.js'`
			`import {mergeInitialWordSymbol} from './plugin/merge-initial-word-symbol.js'`
			`import {mergeInitialisms} from './plugin/merge-initialisms.js'`
			`import {mergeInnerWordSymbol} from './plugin/merge-inner-word-symbol.js'`
			`import {mergeInnerWordSlash} from './plugin/merge-inner-word-slash.js'`
			`import {mergeNonWordSentences} from './plugin/merge-non-word-sentences.js'`
			`import {mergePrefixExceptions} from './plugin/merge-prefix-exceptions.js'`
			`import {mergeRemainingFullStops} from './plugin/merge-remaining-full-stops.js'`
			`import {removeEmptyNodes} from './plugin/remove-empty-nodes.js'`
			`import {patchPosition} from './plugin/patch-position.js'`
			`import {`
			`newLine,`
			`punctuation,`
			`surrogates,`
			`terminalMarker,`
			`whiteSpace,`
			`word`
			`} from './expressions.js'`

			`// PARSE LATIN`

			`/**`
			`* Create a new parser.`
			`*/`
			`export class ParseLatin {`
			`/**`
			`* Create a new parser.`
			`*`
			* This additionally supports `retext`-like call: where an instance is
			`* created for each file, and the file is given on construction.`
			`*`
			`* @param {string \| null \| undefined} [doc]`
			`* Value to parse (optional).`
			`* @param {VFile \| null \| undefined} [file]`
			`* Corresponding file (optional).`
			`*/`
			`constructor(doc, file) {`
			`const value = file \|\| doc`

			`/** @type {string \| undefined} */`
			`this.doc = value ? String(value) : undefined`

			`/** @type {Array<Plugin<Root>>} */`
			`this.tokenizeRootPlugins = [...this.tokenizeRootPlugins]`
			`/** @type {Array<Plugin<Paragraph>>} */`
			`this.tokenizeParagraphPlugins = [...this.tokenizeParagraphPlugins]`
			`/** @type {Array<Plugin<Sentence>>} */`
			`this.tokenizeSentencePlugins = [...this.tokenizeSentencePlugins]`
			`}`

			`/**`
			`* Turn natural language into a syntax tree.`
			`*`
			`* @param {string \| null \| undefined} [value]`
			`* Value to parse (optional).`
			`* @returns {Root}`
			`* Tree.`
			`*/`
			`parse(value) {`
			`return this.tokenizeRoot(value \|\| this.doc)`
			`}`

			`/**`
			`* Parse as a root.`
			`*`
			`* @param {string \| null \| undefined} [value]`
			`* Value to parse (optional).`
			`* @returns {Root}`
			`* Built tree.`
			`*/`
			`tokenizeRoot(value) {`
			`const paragraph = this.tokenizeParagraph(value)`
			`/** @type {Root} */`
			`const result = {`
			`type: 'RootNode',`
			`children: splitNode(paragraph, 'WhiteSpaceNode', newLine)`
			`}`

			`let index = -1`
			`while (this.tokenizeRootPlugins[++index]) {`
			`this.tokenizeRootPlugins[index](result)`
			`}`

			`return result`
			`}`

			`/**`
			`* Parse as a paragraph.`
			`*`
			`* @param {string \| null \| undefined} [value]`
			`* Value to parse (optional).`
			`* @returns {Paragraph}`
			`* Built tree.`
			`*/`
			`tokenizeParagraph(value) {`
			`const sentence = this.tokenizeSentence(value)`
			`/** @type {Paragraph} */`
			`const result = {`
			`type: 'ParagraphNode',`
			`children: splitNode(sentence, 'PunctuationNode', terminalMarker)`
			`}`

			`let index = -1`
			`while (this.tokenizeParagraphPlugins[++index]) {`
			`this.tokenizeParagraphPlugins[index](result)`
			`}`

			`return result`
			`}`

			`/**`
			`* Parse as a sentence.`
			`*`
			`* @param {string \| null \| undefined} [value]`
			`* Value to parse (optional).`
			`* @returns {Sentence}`
			`* Built tree.`
			`*/`
			`tokenizeSentence(value) {`
			`const children = this.tokenize(value)`
			`/** @type {Sentence} */`
			`const result = {type: 'SentenceNode', children}`

			`let index = -1`
			`while (this.tokenizeSentencePlugins[++index]) {`
			`this.tokenizeSentencePlugins[index](result)`
			`}`

			`return result`
			`}`

			`/**`
			* Transform a `value` into a list of nlcsts.
			`*`
			`* @param {string \| null \| undefined} [value]`
			`* Value to parse (optional).`
			`* @returns {Array<SentenceContent>}`
			`* Built sentence content.`
			`*/`
			`tokenize(value) {`
			`/** @type {Array<SentenceContent>} */`
			`const children = []`

			`if (!value) {`
			`return children`
			`}`

			`const currentPoint = {line: 1, column: 1, offset: 0}`
			`let from = 0`
			`let index = 0`
			`let start = {...currentPoint}`
			`/** @type {SentenceContent['type'] \| undefined} */`
			`let previousType`
			`/** @type {string \| undefined} */`
			`let previous`

			`while (index < value.length) {`
			`const current = value.charAt(index)`
			`const currentType = whiteSpace.test(current)`
			`? 'WhiteSpaceNode'`
			`: punctuation.test(current)`
			`? 'PunctuationNode'`
			`: word.test(current)`
			`? 'WordNode'`
			`: 'SymbolNode'`

			`if (`
			`from < index &&`
			`previousType &&`
			`currentType &&`
			`!(`
			`previousType === currentType &&`
			`// Words or white space continue.`
			`(previousType === 'WordNode' \|\|`
			`previousType === 'WhiteSpaceNode' \|\|`
			`// Same character of punctuation or symbol also continues.`
			`current === previous \|\|`
			`// Surrogates of punctuation or symbol also continue.`
			`surrogates.test(current))`
			`)`
			`) {`
			`// Flush the previous queue.`
			`children.push(createNode(previousType, value.slice(from, index)))`
			`from = index`
			`start = {...currentPoint}`
			`}`

			`if (current === '\r' \|\| (current === '\n' && previous !== '\r')) {`
			`currentPoint.line++`
			`currentPoint.column = 1`
			`} else if (current !== '\n') {`
			`currentPoint.column++`
			`}`

			`currentPoint.offset++`
			`previousType = currentType`
			`previous = current`
			`index++`
			`}`

			`if (previousType && from < index) {`
			`children.push(createNode(previousType, value.slice(from, index)))`
			`}`

			`return children`

			`/**`
			`* @param {SentenceContent['type']} type`
			`* Node type to build.`
			`* @param {string} value`
			`* Value.`
			`* @returns {SentenceContent}`
			`* Node.`
			`*/`
			`function createNode(type, value) {`
			`return type === 'WordNode'`
			`? {`
			`type: 'WordNode',`
			`children: [`
			`{`
			`type: 'TextNode',`
			`value,`
			`position: {start, end: {...currentPoint}}`
			`}`
			`],`
			`position: {start, end: {...currentPoint}}`
			`}`
			`: {type, value, position: {start, end: {...currentPoint}}}`
			`}`
			`}`
			`}`

			`/**`
			`* List of transforms handling a sentence.`
			`*/`
			`ParseLatin.prototype.tokenizeSentencePlugins = [`
			`mergeInitialWordSymbol,`
			`mergeFinalWordSymbol,`
			`mergeInnerWordSymbol,`
			`mergeInnerWordSlash,`
			`mergeInitialisms,`
			`patchPosition`
			`]`

			`/**`
			`* List of transforms handling a paragraph.`
			`*/`
			`ParseLatin.prototype.tokenizeParagraphPlugins = [`
			`mergeNonWordSentences,`
			`mergeAffixSymbol,`
			`mergeInitialLowerCaseLetterSentences,`
			`mergeInitialDigitSentences,`
			`mergePrefixExceptions,`
			`mergeAffixExceptions,`
			`mergeRemainingFullStops,`
			`makeInitialWhiteSpaceSiblings,`
			`makeFinalWhiteSpaceSiblings,`
			`breakImplicitSentences,`
			`removeEmptyNodes,`
			`patchPosition`
			`]`

			`/**`
			`* List of transforms handling a root.`
			`*/`
			`ParseLatin.prototype.tokenizeRootPlugins = [`
			`makeInitialWhiteSpaceSiblings,`
			`makeFinalWhiteSpaceSiblings,`
			`removeEmptyNodes,`
			`patchPosition`
			`]`

			`/**`
			`* A function that splits one node into several nodes.`
			`*`
			`* @template {Parents} Node`
			`* Node type.`
			`* @param {Node} node`
			`* Node to split.`
			`* @param {RegExp} expression`
			`* Split on this regex.`
			`* @param {Node['children'][number]['type']} childType`
			`* Split this node type.`
			`* @returns {Array<Node>}`
			`* The given node, split into several nodes.`
			`*/`
			`function splitNode(node, childType, expression) {`
			`/** @type {Array<Node>} */`
			`const result = []`
			`let index = -1`
			`let start = 0`

			`while (++index < node.children.length) {`
			`const token = node.children[index]`

			`if (`
			`index === node.children.length - 1 \|\|`
			`(token.type === childType && expression.test(toString(token)))`
			`) {`
			`/** @type {Node} */`
			`// @ts-expect-error: fine`
			`const parent = {`
			`type: node.type,`
			`children: node.children.slice(start, index + 1)`
			`}`

			`const first = node.children[start]`
			`const last = token`
			`if (first.position && last.position) {`
			`parent.position = {`
			`start: first.position.start,`
			`end: last.position.end`
			`}`
			`}`

			`result.push(parent)`
			`start = index + 1`
			`}`
			`}`

			`return result`
			`}`