diff --git a/package.json b/package.json index a7ecf21f..78cf2b09 100644 --- a/package.json +++ b/package.json @@ -49,6 +49,7 @@ "strip-bom": "3.0.0", "typescript": "2.5.3", "typescript-eslint-parser": "git://github.com/eslint/typescript-eslint-parser.git#9c71a627da36e97da52ed2731d58509c952b67ae", + "unicode-regex": "1.0.1", "unified": "6.1.5" }, "devDependencies": { diff --git a/src/printer-markdown.js b/src/printer-markdown.js index e79228d4..0cb2e077 100644 --- a/src/printer-markdown.js +++ b/src/printer-markdown.js @@ -11,13 +11,7 @@ const fill = docBuilders.fill; const align = docBuilders.align; const docPrinter = require("./doc-printer"); const printDocToString = docPrinter.printDocToString; -const asciiPunctuationPattern = util.asciiPunctuationPattern; -const getCjkRegex = require("cjk-regex"); - -const punctuationPattern = `(?:${[ - getCjkRegex.punctuations().source, - `[${asciiPunctuationPattern}]` -].join("|")})`; +const punctuationCharRange = util.punctuationCharRange; const SINGLE_LINE_NODE_TYPES = [ "heading", @@ -87,7 +81,10 @@ function genericPrint(path, options, print) { .replace(/[*]/g, "\\*") // escape all `*` .replace( new RegExp( - `(^|${punctuationPattern})(_+)|(_+)(${punctuationPattern}|$)`, + [ + `(^|[${punctuationCharRange}])(_+)`, + `(_+)([${punctuationCharRange}]|$)` + ].join("|"), "g" ), (_, text1, underscore1, underscore2, text2) => @@ -118,14 +115,14 @@ function genericPrint(path, options, print) { prevNode.type === "sentence" && prevNode.children.length > 0 && prevNode.children[prevNode.children.length - 1].type === "word" && - new RegExp(`[^${asciiPunctuationPattern}]$`).test( + new RegExp(`[^${punctuationCharRange}]$`).test( prevNode.children[prevNode.children.length - 1].value )) || (nextNode && nextNode.type === "sentence" && nextNode.children.length > 0 && nextNode.children[0].type === "word" && - new RegExp(`^[^${asciiPunctuationPattern}]`).test( + new RegExp(`^[^${punctuationCharRange}]`).test( nextNode.children[0].value )); const style = diff --git a/src/util.js b/src/util.js index 7764e1b6..11d8b612 100644 --- a/src/util.js +++ b/src/util.js @@ -3,19 +3,27 @@ const stringWidth = require("string-width"); const emojiRegex = require("emoji-regex")(); const escapeStringRegexp = require("escape-string-regexp"); - const getCjkRegex = require("cjk-regex"); -const cjkRegex = getCjkRegex(); +const getUnicodeRegex = require("unicode-regex"); -// the `g` flag is dangerous in RegExp#test() -// https://stackoverflow.com/a/21373261 -const cjkPunctuationRegex = new RegExp(getCjkRegex.punctuations().source, ""); +const cjkPattern = getCjkRegex().source; // http://spec.commonmark.org/0.25/#ascii-punctuation-character -const asciiPunctuationPattern = escapeStringRegexp( +const asciiPunctuationCharRange = escapeStringRegexp( "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" ); +// http://spec.commonmark.org/0.25/#punctuation-character +const punctuationCharRange = `${asciiPunctuationCharRange}${getUnicodeRegex([ + "Pc", + "Pd", + "Pe", + "Pf", + "Pi", + "Po", + "Ps" +]).source.slice(1, -1)}`; // remove bracket expression `[` and `]` + function isExportDeclaration(node) { if (node) { switch (node.type) { @@ -672,10 +680,7 @@ function splitText(text) { const nodes = []; text - .replace( - new RegExp(`(${cjkRegex.source})\n(${cjkRegex.source})`, "g"), - "$1$2" - ) + .replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2") // `\s` but exclude full-width whitspace (`\u3000`) .split(/([^\S\u3000]+)/) .forEach((token, index, tokens) => { @@ -692,7 +697,7 @@ function splitText(text) { } token - .split(new RegExp(`(${cjkRegex.source})`)) + .split(new RegExp(`(${cjkPattern})`)) .forEach((innerToken, innerIndex, innerTokens) => { if ( (innerIndex === 0 || innerIndex === innerTokens.length - 1) && @@ -714,7 +719,7 @@ function splitText(text) { } // CJK character - const kind = cjkPunctuationRegex.test(innerToken) + const kind = new RegExp(`[${punctuationCharRange}]`).test(innerToken) ? KIND_CJK_PUNCTUATION : KIND_CJK_CHARACTER; appendNode({ type: "word", value: innerToken, kind }); @@ -729,10 +734,10 @@ function splitText(text) { if ( (lastNode.kind === KIND_NON_CJK && node.kind === KIND_CJK_CHARACTER && - !new RegExp(`[${asciiPunctuationPattern}]$`).test(lastNode.value)) || + !new RegExp(`[${punctuationCharRange}]$`).test(lastNode.value)) || (lastNode.kind === KIND_CJK_CHARACTER && node.kind === KIND_NON_CJK && - !new RegExp(`^[${asciiPunctuationPattern}]`).test(node.value)) + !new RegExp(`^[${punctuationCharRange}]`).test(node.value)) ) { nodes.push({ type: "whitespace", value: " " }); } else if ( @@ -766,7 +771,7 @@ function getStringWidth(text) { } module.exports = { - asciiPunctuationPattern, + punctuationCharRange, getStringWidth, splitText, mapDoc, diff --git a/tests/markdown_paragraph/__snapshots__/jsfmt.spec.js.snap b/tests/markdown_paragraph/__snapshots__/jsfmt.spec.js.snap index c5bf39b4..baa312bc 100644 --- a/tests/markdown_paragraph/__snapshots__/jsfmt.spec.js.snap +++ b/tests/markdown_paragraph/__snapshots__/jsfmt.spec.js.snap @@ -9,6 +9,8 @@ exports[`cjk.md 1`] = ` This ia an english paragraph with a CJK quote "中文". +This ia an english paragraph with a CJK quote “中文“. + 扩展运算符(spread)是三个点(\`...\`)。 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長 @@ -30,6 +32,8 @@ English 混合著中文的一段 Paragraph! This ia an english paragraph with a CJK quote "中文". +This ia an english paragraph with a CJK quote “中文“. + 扩展运算符(spread)是三个点(\`...\`)。 `; @@ -43,6 +47,8 @@ exports[`cjk.md 2`] = ` This ia an english paragraph with a CJK quote "中文". +This ia an english paragraph with a CJK quote “中文“. + 扩展运算符(spread)是三个点(\`...\`)。 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落 @@ -53,6 +59,8 @@ This ia an english paragraph with a CJK quote "中文". This ia an english paragraph with a CJK quote "中文". +This ia an english paragraph with a CJK quote “中文“. + 扩展运算符(spread)是三个点(\`...\`)。 `; diff --git a/tests/markdown_paragraph/cjk.md b/tests/markdown_paragraph/cjk.md index 2cc6927e..ebe0a72d 100644 --- a/tests/markdown_paragraph/cjk.md +++ b/tests/markdown_paragraph/cjk.md @@ -6,4 +6,6 @@ This ia an english paragraph with a CJK quote "中文". +This ia an english paragraph with a CJK quote “中文“. + 扩展运算符(spread)是三个点(`...`)。 diff --git a/yarn.lock b/yarn.lock index edf4cce7..33bfcc9d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4279,6 +4279,10 @@ unherit@^1.0.4: inherits "^2.0.1" xtend "^4.0.1" +unicode-regex@1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-1.0.1.tgz#f819e050191d5b9561a339a58dd3b9095ed94b35" + unified@6.1.5: version "6.1.5" resolved "https://registry.yarnpkg.com/unified/-/unified-6.1.5.tgz#716937872621a63135e62ced2f3ac6a063c6fb87"