/*************************************************************
 *
 *  Copyright (c) 2018-2022 The MathJax Consortium
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
|
|
|
|
/**
 * @fileoverview  Implements a lightweight DOM parser and serializer
 *                for use with the lightweight DOM adaptor
 *
 * @author dpvc@mathjax.org (Davide Cervone)
 */
|
|
|
|
import {AttributeData} from '../../core/DOMAdaptor.js';
|
|
import {MinDOMParser} from '../HTMLAdaptor.js';
|
|
import * as Entities from '../../util/Entities.js';
|
|
import {LiteDocument} from './Document.js';
|
|
import {LiteElement} from './Element.js';
|
|
import {LiteText, LiteComment} from './Text.js';
|
|
import {LiteAdaptor} from '../liteAdaptor.js';
|
|
|
|
/**
 * Patterns used in parsing serialized HTML
 */
|
|
export namespace PATTERNS {
  //
  //  Regular-expression source fragments.  The *SPLIT variants are the same
  //  as their plain counterparts but use capturing groups, so that
  //  String.split() returns the pieces that were matched.
  //
  export const TAGNAME = '[a-z][^\\s\\n>]*';
  export const ATTNAME = '[a-z][^\\s\\n>=]*';
  export const VALUE = `(?:'[^']*'|"[^"]*"|[^\\s\\n]+)`;
  export const VALUESPLIT = `(?:'([^']*)'|"([^"]*)"|([^\\s\\n]+))`;
  export const SPACE = '(?:\\s|\\n)+';
  export const OPTIONALSPACE = '(?:\\s|\\n)*';

  //
  //  An attribute is a name optionally followed by "= value"
  //
  export const ATTRIBUTE = `${ATTNAME}(?:${OPTIONALSPACE}=${OPTIONALSPACE}${VALUE})?`;
  export const ATTRIBUTESPLIT = `(${ATTNAME})(?:${OPTIONALSPACE}=${OPTIONALSPACE}${VALUESPLIT})?`;

  //
  //  A tag is an open tag with attributes, a close tag, a comment, or any
  //  other "<!...>" declaration, ending at ">" or at the end of the string.
  //  The whole tag is captured so that split() keeps it in the result.
  //
  export const TAG =
    `(<(?:${TAGNAME}(?:${SPACE}${ATTRIBUTE})*${OPTIONALSPACE}/?|/${TAGNAME}|!--[^]*?--|![^]*?)(?:>|$))`;

  //
  //  The compiled (case-insensitive) expressions
  //
  export const tag = new RegExp(TAG, 'i');
  export const attr = new RegExp(ATTRIBUTE, 'i');
  export const attrsplit = new RegExp(ATTRIBUTESPLIT, 'i');
}
|
|
|
|
/************************************************************/
/**
 * Implements a lightweight DOMParser replacement
 * (Not perfect, but handles most well-formed HTML)
 */
|
|
export class LiteParser implements MinDOMParser<LiteDocument> {

  /**
   * The list of self-closing tags
   */
  public static SELF_CLOSING: {[name: string]: boolean} = {
    area: true,
    base: true,
    br: true,
    col: true,
    command: true,
    embed: true,
    hr: true,
    img: true,
    input: true,
    keygen: true,
    link: true,
    menuitem: true,
    meta: true,
    param: true,
    source: true,
    track: true,
    wbr: true
  };

  /**
   * The list of tags whose content is not parsed (PCDATA)
   */
  public static PCDATA: {[name: string]: boolean} = {
    option: true,
    textarea: true,
    fieldset: true,
    title: true,
    style: true,
    script: true
  };

  /**
   * The list of attributes that don't get entity translation
   */
  public static CDATA_ATTR: {[name: string]: boolean} = {
    style: true,
    datafld: true,
    datasrc: true,
    href: true,
    src: true,
    longdesc: true,
    usemap: true,
    cite: true,
    datetime: true,
    action: true,
    axis: true,
    profile: true,
    content: true,
    scheme: true
  };

  /**
   * Parse a serialized HTML string into a new document.
   *
   * @param {string} text           The serialized HTML to parse
   * @param {string} _format        The format of the text (unused here)
   * @param {LiteAdaptor} adaptor   The adaptor used to create and link nodes.  It is
   *                                  dereferenced immediately, so it must be supplied
   *                                  even though its declared default is null.
   * @return {LiteDocument}         The document created by the adaptor
   *
   * @override
   */
  public parseFromString(text: string, _format: string = 'text/html', adaptor: LiteAdaptor = null) {
    const root = adaptor.createDocument();
    let node = adaptor.body(root);
    //
    // Remove <?...?> sequences, then split the HTML into an array of
    //   text, tag, text, tag, ...  (the tags are kept because the
    //   pattern captures them).  Then loop through the pairs, adding
    //   text nodes and processing tags.
    //
    const parts = text.replace(/<\?.*?\?>/g, '').split(PATTERNS.tag);
    while (parts.length) {
      const content = parts.shift();
      const tag = parts.shift();
      if (content) {
        this.addText(adaptor, node, content);
      }
      //
      // A tag that doesn't end with ">" was cut off by the end of
      //   the string, and is dropped.
      //
      if (tag && tag.charAt(tag.length - 1) === '>') {
        if (tag.charAt(1) === '!') {
          this.addComment(adaptor, node, tag);
        } else if (tag.charAt(1) === '/') {
          node = this.closeTag(adaptor, node, tag);
        } else {
          node = this.openTag(adaptor, node, tag, parts);
        }
      }
    }
    this.checkDocument(adaptor, root);
    return root;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to add a text element to
   * @param {string} text          The text for the text node (entities are translated)
   * @return {LiteText}            The text element
   */
  protected addText(adaptor: LiteAdaptor, node: LiteElement, text: string): LiteText {
    const translated = Entities.translate(text);
    return adaptor.append(node, adaptor.text(translated)) as LiteText;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to add a comment to
   * @param {string} comment       The text for the comment node (the complete "<!...>" tag)
   * @return {LiteComment}         The comment element
   */
  protected addComment(adaptor: LiteAdaptor, node: LiteElement, comment: string): LiteComment {
    return adaptor.append(node, new LiteComment(comment)) as LiteComment;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to close
   * @param {string} tag           The close tag being processed
   * @return {LiteElement}         The first unclosed parent node, or the original
   *                                 node when the close tag matches no open element
   */
  protected closeTag(adaptor: LiteAdaptor, node: LiteElement, tag: string): LiteElement {
    const kind = tag.slice(2, tag.length - 1).toLowerCase();
    //
    // Look up the ancestor chain (starting with the node itself) for
    //   the element that this tag closes.
    //
    let open = node;
    while (open && adaptor.kind(open) !== kind) {
      open = adaptor.parent(open);
    }
    //
    // A stray close tag (no matching open element), or one that would
    //   close the top-most element, is ignored, so that the current node
    //   never becomes null and later content still has a parent.
    //
    const parent = (open ? adaptor.parent(open) : null);
    return parent || node;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The parent node for the tag
   * @param {string} tag           The tag being processed
   * @param {string[]} parts       The rest of the text/tag array
   * @return {LiteElement}         The node to which the next tag will be added
   */
  protected openTag(adaptor: LiteAdaptor, node: LiteElement, tag: string, parts: string[]): LiteElement {
    const PCDATA = (this.constructor as typeof LiteParser).PCDATA;
    const SELF_CLOSING = (this.constructor as typeof LiteParser).SELF_CLOSING;
    //
    // Get the child to be added to the node
    //
    const kind = tag.match(/<(.*?)[\s\n>\/]/)[1].toLowerCase();
    const child = adaptor.node(kind) as LiteElement;
    //
    // Split out the tag attributes as an array of space, name, value1, value2, value3,
    //   where value1, value2, and value3 are the value of the attribute (only one is defined)
    //   that come from matching quoted strings with ' (value1), " (value2) or no quotes (value3).
    //
    const attributes = tag.replace(/^<.*?[\s\n>]/, '').split(PATTERNS.attrsplit);
    //
    // If the tag was complete (it ends with > or has no attributes)
    //
    if (attributes.pop().match(/>$/) || attributes.length < 5) {
      this.addAttributes(adaptor, child, attributes);
      adaptor.append(node, child);
      //
      // For non-self-closing tags:
      //   For tags whose content is PCDATA (like <script>), collect the
      //     content up until the end tag, and continue adding new tags
      //     to the current parent node.
      //   Otherwise, the child tag becomes the parent node to which
      //     new tags are added.
      //
      if (!SELF_CLOSING[kind] && !tag.match(/\/>$/)) {
        if (PCDATA[kind]) {
          this.handlePCDATA(adaptor, child, kind, parts);
        } else {
          node = child;
        }
      }
    }
    return node;
  }

  /**
   * @param {LiteAdaptor} adaptor   The adaptor for managing nodes
   * @param {LiteElement} node      The node getting the attributes
   * @param {string[]} attributes   The array of space, name, value1, value2, value3
   *                                  groups as described in openTag() (the array is consumed)
   */
  protected addAttributes(adaptor: LiteAdaptor, node: LiteElement, attributes: string[]) {
    const CDATA_ATTR = (this.constructor as typeof LiteParser).CDATA_ATTR;
    while (attributes.length) {
      const [ , name, v1, v2, v3] = attributes.splice(0, 5);
      let value = v1 || v2 || v3 || '';
      //
      // Attributes in the CDATA list keep their value verbatim
      //
      if (!CDATA_ATTR[name]) {
        value = Entities.translate(value);
      }
      adaptor.setAttribute(node, name, value);
    }
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node whose PCDATA content is being collected
   * @param {string} kind          The (lowercase) tag name being handled
   * @param {string[]} parts       The array of text/tag data for the document
   *                                 (the collected content and the end tag are removed from it)
   */
  protected handlePCDATA(adaptor: LiteAdaptor, node: LiteElement, kind: string, parts: string[]) {
    const pcdata = [] as string[];
    const etag = '</' + kind + '>';
    let ptag = '';
    //
    // Look through the parts until the end tag is found.
    //   Add the unmatched tag and the following text
    //   and try the next tag until we find the end tag.
    //   The comparison ignores case, since kind is lowercase but the
    //   end tag is given as it appears in the source (e.g., </SCRIPT>).
    //   (ptag is undefined once the parts run out, hence the fallback.)
    //
    while (parts.length && (ptag || '').toLowerCase() !== etag) {
      pcdata.push(ptag);
      pcdata.push(parts.shift());
      ptag = parts.shift();
    }
    //
    // Add the collected contents as a text node
    //
    adaptor.append(node, adaptor.text(pcdata.join('')));
  }

  /**
   * Check the contents of the parsed document and move html, head, and body
   * tags into the document structure.  That way, you can parse fragments or
   * full documents and still get a valid document.
   *
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteDocument} root    The document being checked
   */
  protected checkDocument(adaptor: LiteAdaptor, root: LiteDocument) {
    const node = this.getOnlyChild(adaptor, adaptor.body(root));
    if (!node) return;
    //
    // Record a <!DOCTYPE ...> that comes before the sole element
    //
    for (const child of adaptor.childNodes(adaptor.body(root))) {
      if (child === node) {
        break;
      }
      if (child instanceof LiteComment && child.value.match(/^<!DOCTYPE/)) {
        root.type = child.value;
      }
    }
    switch (adaptor.kind(node)) {
    case 'html':
      //
      // Look through the children for the head and body
      //
      for (const child of node.children) {
        switch (adaptor.kind(child)) {
        case 'head':
          root.head = child as LiteElement;
          break;
        case 'body':
          root.body = child as LiteElement;
          break;
        }
      }
      //
      // Make sure the elements are linked in properly
      //
      root.root = node;
      adaptor.remove(node);
      if (adaptor.parent(root.body) !== node) {
        adaptor.append(node, root.body);
      }
      if (adaptor.parent(root.head) !== node) {
        adaptor.insert(root.head, root.body);
      }
      break;

    case 'head':
      root.head = adaptor.replace(node, root.head) as LiteElement;
      break;

    case 'body':
      root.body = adaptor.replace(node, root.body) as LiteElement;
      break;
    }
  }

  /**
   * Checks if the body has only one element child (as opposed to comments or text nodes)
   * and returns that sole element (or null if none or more than one)
   *
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} body     The body element being checked
   * @return {LiteElement}         The sole LiteElement child of the body, or null if none or more than one
   */
  protected getOnlyChild(adaptor: LiteAdaptor, body: LiteElement): LiteElement {
    let node: LiteElement = null;
    for (const child of adaptor.childNodes(body)) {
      if (child instanceof LiteElement) {
        if (node) return null;
        node = child;
      }
    }
    return node;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node to serialize
   * @param {boolean} xml          True when producing XML, false for HTML
   * @return {string}              The serialized element (like outerHTML)
   */
  public serialize(adaptor: LiteAdaptor, node: LiteElement, xml: boolean = false): string {
    const SELF_CLOSING = (this.constructor as typeof LiteParser).SELF_CLOSING;
    const CDATA = (this.constructor as typeof LiteParser).CDATA_ATTR;
    const tag = adaptor.kind(node);
    //
    // Attributes in the CDATA list are written verbatim; the others
    //   have their double quotes protected.
    //
    const attributes = adaptor.allAttributes(node).map(
      (x: AttributeData) => x.name + '="' + (CDATA[x.name] ? x.value : this.protectAttribute(x.value)) + '"'
    ).join(' ');
    const content = this.serializeInner(adaptor, node, xml);
    //
    // Self-closing tags get no content or end tag; in XML mode they,
    //   and any empty element, are written as <tag/>.
    //
    const html =
      '<' + tag + (attributes ? ' ' + attributes : '')
      + ((!xml || content) && !SELF_CLOSING[tag] ? `>${content}</${tag}>` : xml ? '/>' : '>');
    return html;
  }

  /**
   * @param {LiteAdaptor} adaptor  The adaptor for managing nodes
   * @param {LiteElement} node     The node whose innerHTML is needed
   * @param {boolean} xml          True when producing XML, false for HTML
   * @return {string}              The serialized element (like innerHTML)
   */
  public serializeInner(adaptor: LiteAdaptor, node: LiteElement, xml: boolean = false): string {
    const PCDATA = (this.constructor as typeof LiteParser).PCDATA;
    //
    // PCDATA content (e.g., in <script>) is output without escaping
    //
    if (PCDATA.hasOwnProperty(node.kind)) {
      return adaptor.childNodes(node).map(x => adaptor.value(x)).join('');
    }
    return adaptor.childNodes(node).map(x => {
      const kind = adaptor.kind(x);
      return (kind === '#text' ? this.protectHTML(adaptor.value(x)) :
              kind === '#comment' ? (x as LiteComment).value :
              this.serialize(adaptor, x as LiteElement, xml));
    }).join('');
  }

  /**
   * @param {string} text  The attribute value to be HTML escaped
   *                         (non-string values are converted to strings first)
   * @return {string}      The string with " replaced by &quot;
   */
  public protectAttribute(text: string): string {
    if (typeof text !== 'string') {
      text = String(text);
    }
    return text.replace(/"/g, '&quot;');
  }

  /**
   * @param {string} text  The text to be HTML escaped
   * @return {string}      The string with &, <, and > replaced by entities
   */
  public protectHTML(text: string): string {
    //
    // The & must be handled first so that the entities added for
    //   < and > are not escaped a second time.
    //
    return text.replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

}
|