"use strict"; const { cjkPattern, kPattern, punctuationPattern } = require("./constants.evaluate"); const { getLast } = require("../common/util"); const INLINE_NODE_TYPES = [ "liquidNode", "inlineCode", "emphasis", "strong", "delete", "link", "linkReference", "image", "imageReference", "footnote", "footnoteReference", "sentence", "whitespace", "word", "break", "inlineMath" ]; const INLINE_NODE_WRAPPER_TYPES = INLINE_NODE_TYPES.concat([ "tableCell", "paragraph", "heading" ]); const kRegex = new RegExp(kPattern); const punctuationRegex = new RegExp(punctuationPattern); /** * split text into whitespaces and words * @param {string} text * @return {Array<{ type: "whitespace", value: " " | "\n" | "" } | { type: "word", value: string }>} */ function splitText(text, options) { const KIND_NON_CJK = "non-cjk"; const KIND_CJ_LETTER = "cj-letter"; const KIND_K_LETTER = "k-letter"; const KIND_CJK_PUNCTUATION = "cjk-punctuation"; const nodes = []; (options.proseWrap === "preserve" ? text : text.replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2") ) .split(/([ \t\n]+)/) .forEach((token, index, tokens) => { // whitespace if (index % 2 === 1) { nodes.push({ type: "whitespace", value: /\n/.test(token) ? "\n" : " " }); return; } // word separated by whitespace if ((index === 0 || index === tokens.length - 1) && token === "") { return; } token .split(new RegExp(`(${cjkPattern})`)) .forEach((innerToken, innerIndex, innerTokens) => { if ( (innerIndex === 0 || innerIndex === innerTokens.length - 1) && innerToken === "" ) { return; } // non-CJK word if (innerIndex % 2 === 0) { if (innerToken !== "") { appendNode({ type: "word", value: innerToken, kind: KIND_NON_CJK, hasLeadingPunctuation: punctuationRegex.test(innerToken[0]), hasTrailingPunctuation: punctuationRegex.test( getLast(innerToken) ) }); } return; } // CJK character appendNode( punctuationRegex.test(innerToken) ? { type: "word", value: innerToken, kind: KIND_CJK_PUNCTUATION, hasLeadingPunctuation: true, hasTrailingPunctuation: true } : { type: "word", value: innerToken, kind: kRegex.test(innerToken) ? KIND_K_LETTER : KIND_CJ_LETTER, hasLeadingPunctuation: false, hasTrailingPunctuation: false } ); }); }); return nodes; function appendNode(node) { const lastNode = getLast(nodes); if (lastNode && lastNode.type === "word") { if ( (lastNode.kind === KIND_NON_CJK && node.kind === KIND_CJ_LETTER && !lastNode.hasTrailingPunctuation) || (lastNode.kind === KIND_CJ_LETTER && node.kind === KIND_NON_CJK && !node.hasLeadingPunctuation) ) { nodes.push({ type: "whitespace", value: " " }); } else if ( !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) && // disallow leading/trailing full-width whitespace ![lastNode.value, node.value].some(value => /\u3000/.test(value)) ) { nodes.push({ type: "whitespace", value: "" }); } } nodes.push(node); function isBetween(kind1, kind2) { return ( (lastNode.kind === kind1 && node.kind === kind2) || (lastNode.kind === kind2 && node.kind === kind1) ); } } } function getOrderedListItemInfo(orderListItem, originalText) { const [, numberText, marker, leadingSpaces] = originalText .slice( orderListItem.position.start.offset, orderListItem.position.end.offset ) .match(/^\s*(\d+)(\.|\))(\s*)/); return { numberText, marker, leadingSpaces }; } // workaround for https://github.com/remarkjs/remark/issues/351 // leading and trailing newlines are stripped by remark function getFencedCodeBlockValue(node, originalText) { const text = originalText.slice( node.position.start.offset, node.position.end.offset ); const leadingSpaceCount = text.match(/^\s*/)[0].length; const replaceRegex = new RegExp(`^\\s{0,${leadingSpaceCount}}`); const lineContents = text.split("\n"); const markerStyle = text[leadingSpaceCount]; // ` or ~ const marker = text .slice(leadingSpaceCount) .match(new RegExp(`^[${markerStyle}]+`))[0]; // https://spec.commonmark.org/0.28/#example-104: Closing fences may be indented by 0-3 spaces // https://spec.commonmark.org/0.28/#example-93: The closing code fence must be at least as long as the opening fence const hasEndMarker = new RegExp(`^\\s{0,3}${marker}`).test( lineContents[lineContents.length - 1].slice( getIndent(lineContents.length - 1) ) ); return lineContents .slice(1, hasEndMarker ? -1 : undefined) .map((x, i) => x.slice(getIndent(i + 1)).replace(replaceRegex, "")) .join("\n"); function getIndent(lineIndex) { return node.position.indent[lineIndex - 1] - 1; } } function mapAst(ast, handler) { return (function preorder(node, index, parentStack) { parentStack = parentStack || []; const newNode = Object.assign({}, handler(node, index, parentStack)); if (newNode.children) { newNode.children = newNode.children.map((child, index) => { return preorder(child, index, [newNode].concat(parentStack)); }); } return newNode; })(ast, null, null); } module.exports = { mapAst, splitText, punctuationPattern, getFencedCodeBlockValue, getOrderedListItemInfo, INLINE_NODE_TYPES, INLINE_NODE_WRAPPER_TYPES };