fix(markdown): handle punctuation variants (#3254)

* fix(markdown): handle punctuation variants
* docs: add comment
2017-11-13 00:09:04 +08:00 · 2017-11-13 00:09:04 +08:00 · d08df0b221
parent ece764a049
commit d08df0b221
6 changed files with 42 additions and 25 deletions
--- a/package.json
+++ b/package.json
@@ -49,6 +49,7 @@
    "strip-bom": "3.0.0",
    "typescript": "2.5.3",
    "typescript-eslint-parser": "git://github.com/eslint/typescript-eslint-parser.git#9c71a627da36e97da52ed2731d58509c952b67ae",
+    "unicode-regex": "1.0.1",
    "unified": "6.1.5"
  },
  "devDependencies": {
--- a/src/printer-markdown.js
+++ b/src/printer-markdown.js
@@ -11,13 +11,7 @@ const fill = docBuilders.fill;
 const align = docBuilders.align;
 const docPrinter = require("./doc-printer");
 const printDocToString = docPrinter.printDocToString;
-const asciiPunctuationPattern = util.asciiPunctuationPattern;
-const getCjkRegex = require("cjk-regex");
-
-const punctuationPattern = `(?:${[
-  getCjkRegex.punctuations().source,
-  `[${asciiPunctuationPattern}]`
-].join("|")})`;
+const punctuationCharRange = util.punctuationCharRange;

 const SINGLE_LINE_NODE_TYPES = [
  "heading",
@@ -87,7 +81,10 @@ function genericPrint(path, options, print) {
        .replace(/[*]/g, "\\*") // escape all `*`
        .replace(
          new RegExp(
-            `(^|${punctuationPattern})(_+)|(_+)(${punctuationPattern}|$)`,
+            [
+              `(^|[${punctuationCharRange}])(_+)`,
+              `(_+)([${punctuationCharRange}]|$)`
+            ].join("|"),
            "g"
          ),
          (_, text1, underscore1, underscore2, text2) =>
@@ -118,14 +115,14 @@ function genericPrint(path, options, print) {
          prevNode.type === "sentence" &&
          prevNode.children.length > 0 &&
          prevNode.children[prevNode.children.length - 1].type === "word" &&
-          new RegExp(`[^${asciiPunctuationPattern}]$`).test(
+          new RegExp(`[^${punctuationCharRange}]$`).test(
            prevNode.children[prevNode.children.length - 1].value
          )) ||
        (nextNode &&
          nextNode.type === "sentence" &&
          nextNode.children.length > 0 &&
          nextNode.children[0].type === "word" &&
-          new RegExp(`^[^${asciiPunctuationPattern}]`).test(
+          new RegExp(`^[^${punctuationCharRange}]`).test(
            nextNode.children[0].value
          ));
      const style =
--- a/src/util.js
+++ b/src/util.js
@@ -3,19 +3,27 @@
 const stringWidth = require("string-width");
 const emojiRegex = require("emoji-regex")();
 const escapeStringRegexp = require("escape-string-regexp");
-
 const getCjkRegex = require("cjk-regex");
-const cjkRegex = getCjkRegex();
+const getUnicodeRegex = require("unicode-regex");

-// the `g` flag is dangerous in RegExp#test()
-// https://stackoverflow.com/a/21373261
-const cjkPunctuationRegex = new RegExp(getCjkRegex.punctuations().source, "");
+const cjkPattern = getCjkRegex().source;

 // http://spec.commonmark.org/0.25/#ascii-punctuation-character
-const asciiPunctuationPattern = escapeStringRegexp(
+const asciiPunctuationCharRange = escapeStringRegexp(
  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
 );

+// http://spec.commonmark.org/0.25/#punctuation-character
+const punctuationCharRange = `${asciiPunctuationCharRange}${getUnicodeRegex([
+  "Pc",
+  "Pd",
+  "Pe",
+  "Pf",
+  "Pi",
+  "Po",
+  "Ps"
+]).source.slice(1, -1)}`; // remove bracket expression `[` and `]`
+
 function isExportDeclaration(node) {
  if (node) {
    switch (node.type) {
@@ -672,10 +680,7 @@ function splitText(text) {
  const nodes = [];

  text
-    .replace(
-      new RegExp(`(${cjkRegex.source})\n(${cjkRegex.source})`, "g"),
-      "$1$2"
-    )
+    .replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2")
    // `\s` but exclude full-width whitspace (`\u3000`)
    .split(/([^\S\u3000]+)/)
    .forEach((token, index, tokens) => {
@@ -692,7 +697,7 @@ function splitText(text) {
      }

      token
-        .split(new RegExp(`(${cjkRegex.source})`))
+        .split(new RegExp(`(${cjkPattern})`))
        .forEach((innerToken, innerIndex, innerTokens) => {
          if (
            (innerIndex === 0 || innerIndex === innerTokens.length - 1) &&
@@ -714,7 +719,7 @@ function splitText(text) {
          }

          // CJK character
-          const kind = cjkPunctuationRegex.test(innerToken)
+          const kind = new RegExp(`[${punctuationCharRange}]`).test(innerToken)
            ? KIND_CJK_PUNCTUATION
            : KIND_CJK_CHARACTER;
          appendNode({ type: "word", value: innerToken, kind });
@@ -729,10 +734,10 @@ function splitText(text) {
      if (
        (lastNode.kind === KIND_NON_CJK &&
          node.kind === KIND_CJK_CHARACTER &&
-          !new RegExp(`[${asciiPunctuationPattern}]$`).test(lastNode.value)) ||
+          !new RegExp(`[${punctuationCharRange}]$`).test(lastNode.value)) ||
        (lastNode.kind === KIND_CJK_CHARACTER &&
          node.kind === KIND_NON_CJK &&
-          !new RegExp(`^[${asciiPunctuationPattern}]`).test(node.value))
+          !new RegExp(`^[${punctuationCharRange}]`).test(node.value))
      ) {
        nodes.push({ type: "whitespace", value: " " });
      } else if (
@@ -766,7 +771,7 @@ function getStringWidth(text) {
 }

 module.exports = {
-  asciiPunctuationPattern,
+  punctuationCharRange,
  getStringWidth,
  splitText,
  mapDoc,
--- a/tests/markdown_paragraph/snapshots/jsfmt.spec.js.snap
+++ b/tests/markdown_paragraph/snapshots/jsfmt.spec.js.snap
@@ -9,6 +9,8 @@ exports[`cjk.md 1`] = `

 This ia an english paragraph with a CJK quote "中文".

+This ia an english paragraph with a CJK quote “中文“.
+
 扩展运算符（spread）是三个点（\`...\`）。
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長
@@ -30,6 +32,8 @@ English 混合著中文的一段 Paragraph！

 This ia an english paragraph with a CJK quote "中文".

+This ia an english paragraph with a CJK quote “中文“.
+
 扩展运算符（spread）是三个点（\`...\`）。

 `;
@@ -43,6 +47,8 @@ exports[`cjk.md 2`] = `

 This ia an english paragraph with a CJK quote "中文".

+This ia an english paragraph with a CJK quote “中文“.
+
 扩展运算符（spread）是三个点（\`...\`）。
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落
@@ -53,6 +59,8 @@ This ia an english paragraph with a CJK quote "中文".

 This ia an english paragraph with a CJK quote "中文".

+This ia an english paragraph with a CJK quote “中文“.
+
 扩展运算符（spread）是三个点（\`...\`）。

 `;
--- a/tests/markdown_paragraph/cjk.md
+++ b/tests/markdown_paragraph/cjk.md
@@ -6,4 +6,6 @@

 This ia an english paragraph with a CJK quote "中文".

+This ia an english paragraph with a CJK quote “中文“.
+
 扩展运算符（spread）是三个点（`...`）。
--- a/yarn.lock
+++ b/yarn.lock
@@ -4279,6 +4279,10 @@ unherit@^1.0.4:
    inherits "^2.0.1"
    xtend "^4.0.1"

+unicode-regex@1.0.1:
+  version "1.0.1"
+  resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-1.0.1.tgz#f819e050191d5b9561a339a58dd3b9095ed94b35"
+
 unified@6.1.5:
  version "6.1.5"
  resolved "https://registry.yarnpkg.com/unified/-/unified-6.1.5.tgz#716937872621a63135e62ced2f3ac6a063c6fb87"