fix(markdown): do not add whitespaces between Latin and Hangul (#5040)

Based on https://github.com/prettier/prettier/issues/5028#issuecomment-417825085, it seems Korean text uses conventional spaces, so we're not going to add whitespace between Latin and Hangul.
2018-09-02 16:20:22 +08:00 · 2018-09-02 16:20:22 +08:00 · 91206891cd
parent e86f08555a
commit 91206891cd
12 changed files with 252 additions and 172 deletions
--- a/package.json
+++ b/package.json
@ -20,7 +20,7 @@
    "@iarna/toml": "2.0.0",
    "camelcase": "4.1.0",
    "chalk": "2.1.0",
-    "cjk-regex": "1.0.2",
+    "cjk-regex": "2.0.0",
    "cosmiconfig": "5.0.6",
    "dashify": "0.2.2",
    "dedent": "0.7.0",
@ -53,13 +53,14 @@
    "postcss-scss": "1.0.6",
    "postcss-selector-parser": "2.2.3",
    "postcss-values-parser": "1.5.0",
+    "regexp-util": "1.2.2",
    "remark-parse": "5.0.0",
    "resolve": "1.5.0",
    "semver": "5.4.1",
    "string-width": "2.1.1",
    "typescript": "3.0.1",
    "typescript-eslint-parser": "18.0.0",
-    "unicode-regex": "1.0.1",
+    "unicode-regex": "2.0.0",
    "unified": "6.1.6",
    "vnopts": "1.0.2",
    "yaml": "1.0.0-rc.8",
--- a/scripts/build/bundler.js
+++ b/scripts/build/bundler.js
@ -14,6 +14,7 @@ const uglify = require("rollup-plugin-uglify");
 const babel = require("rollup-plugin-babel");
 const nativeShims = require("./rollup-plugins/native-shims");
 const executable = require("./rollup-plugins/executable");
+const evaluate = require("./rollup-plugins/evaluate");

 const EXTERNALS = [
  "assert",
@ -108,6 +109,7 @@ function getRollupConfig(bundle) {
  config.plugins = [
    replace(replaceStrings),
    executable(),
+    evaluate(),
    json(),
    bundle.alias && alias(bundle.alias),
    bundle.target === "universal" &&
--- a/scripts/build/rollup-plugins/evaluate.js
+++ b/scripts/build/rollup-plugins/evaluate.js
@ -0,0 +1,28 @@
+"use strict";
+
+module.exports = function() {
+  return {
+    name: "evaluate",
+
+    transform(_text, id) {
+      if (!/\.evaluate\.js$/.test(id)) {
+        return null;
+      }
+
+      const json = JSON.stringify(
+        require(id.replace(/^\0commonjs-proxy:/, "")),
+        (_, v) => {
+          if (typeof v === "function") {
+            throw new Error(`Cannot evaluate functions.`);
+          }
+          return v;
+        }
+      );
+
+      return {
+        code: `const json = ${json}; export default json;`,
+        map: { mappings: "" }
+      };
+    }
+  };
+};
--- a/src/common/util.js
+++ b/src/common/util.js
@ -3,32 +3,10 @@
 const stringWidth = require("string-width");
 const emojiRegex = require("emoji-regex")();
 const escapeStringRegexp = require("escape-string-regexp");
-const getCjkRegex = require("cjk-regex");
-const getUnicodeRegex = require("unicode-regex");

 // eslint-disable-next-line no-control-regex
 const notAsciiRegex = /[^\x20-\x7F]/;

-const cjkPattern = getCjkRegex().source;
-
-// http://spec.commonmark.org/0.25/#ascii-punctuation-character
-const asciiPunctuationCharRange = escapeStringRegexp(
-  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
-);
-
-// http://spec.commonmark.org/0.25/#punctuation-character
-const punctuationCharRange = `${asciiPunctuationCharRange}${getUnicodeRegex([
-  "Pc",
-  "Pd",
-  "Pe",
-  "Pf",
-  "Pi",
-  "Po",
-  "Ps"
-]).source.slice(1, -1)}`; // remove bracket expression `[` and `]`
-
-const punctuationRegex = new RegExp(`[${punctuationCharRange}]`);
-
 function isExportDeclaration(node) {
  if (node) {
    switch (node.type) {
@ -598,119 +576,6 @@ function getMaxContinuousCount(str, target) {
  );
 }

-/**
- * split text into whitespaces and words
- * @param {string} text
- * @return {Array<{ type: "whitespace", value: " " | "\n" | "" } | { type: "word", value: string }>}
- */
-function splitText(text, options) {
-  const KIND_NON_CJK = "non-cjk";
-  const KIND_CJK_CHARACTER = "cjk-character";
-  const KIND_CJK_PUNCTUATION = "cjk-punctuation";
-
-  const nodes = [];
-
-  (options.proseWrap === "preserve"
-    ? text
-    : text.replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2")
-  )
-    .split(/([ \t\n]+)/)
-    .forEach((token, index, tokens) => {
-      // whitespace
-      if (index % 2 === 1) {
-        nodes.push({
-          type: "whitespace",
-          value: /\n/.test(token) ? "\n" : " "
-        });
-        return;
-      }
-
-      // word separated by whitespace
-
-      if ((index === 0 || index === tokens.length - 1) && token === "") {
-        return;
-      }
-
-      token
-        .split(new RegExp(`(${cjkPattern})`))
-        .forEach((innerToken, innerIndex, innerTokens) => {
-          if (
-            (innerIndex === 0 || innerIndex === innerTokens.length - 1) &&
-            innerToken === ""
-          ) {
-            return;
-          }
-
-          // non-CJK word
-          if (innerIndex % 2 === 0) {
-            if (innerToken !== "") {
-              appendNode({
-                type: "word",
-                value: innerToken,
-                kind: KIND_NON_CJK,
-                hasLeadingPunctuation: punctuationRegex.test(innerToken[0]),
-                hasTrailingPunctuation: punctuationRegex.test(
-                  getLast(innerToken)
-                )
-              });
-            }
-            return;
-          }
-
-          // CJK character
-          appendNode(
-            punctuationRegex.test(innerToken)
-              ? {
-                  type: "word",
-                  value: innerToken,
-                  kind: KIND_CJK_PUNCTUATION,
-                  hasLeadingPunctuation: true,
-                  hasTrailingPunctuation: true
-                }
-              : {
-                  type: "word",
-                  value: innerToken,
-                  kind: KIND_CJK_CHARACTER,
-                  hasLeadingPunctuation: false,
-                  hasTrailingPunctuation: false
-                }
-          );
-        });
-    });
-
-  return nodes;
-
-  function appendNode(node) {
-    const lastNode = getLast(nodes);
-    if (lastNode && lastNode.type === "word") {
-      if (
-        (lastNode.kind === KIND_NON_CJK &&
-          node.kind === KIND_CJK_CHARACTER &&
-          !lastNode.hasTrailingPunctuation) ||
-        (lastNode.kind === KIND_CJK_CHARACTER &&
-          node.kind === KIND_NON_CJK &&
-          !node.hasLeadingPunctuation)
-      ) {
-        nodes.push({ type: "whitespace", value: " " });
-      } else if (
-        !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) &&
-        // disallow leading/trailing full-width whitespace
-        ![lastNode.value, node.value].some(value => /\u3000/.test(value))
-      ) {
-        nodes.push({ type: "whitespace", value: "" });
-      }
-    }
-    nodes.push(node);
-
-    function isBetween(kind1, kind2) {
-      return (
-        (lastNode.kind === kind1 && node.kind === kind2) ||
-        (lastNode.kind === kind2 && node.kind === kind1)
-      );
-    }
-  }
-}
-
 function getStringWidth(text) {
  if (!text) {
    return 0;
@ -803,10 +668,7 @@ function isWithinParentArrayProperty(path, propertyName) {
 }

 module.exports = {
-  punctuationRegex,
-  punctuationCharRange,
  getStringWidth,
-  splitText,
  getMaxContinuousCount,
  getPrecedence,
  shouldFlatten,
--- a/src/language-markdown/constants.evaluate.js
+++ b/src/language-markdown/constants.evaluate.js
@ -0,0 +1,38 @@
+"use strict";
+
+const cjkRegex = require("cjk-regex");
+const regexpUtil = require("regexp-util");
+const unicodeRegex = require("unicode-regex");
+
+const cjkPattern = cjkRegex().toString();
+const kPattern = unicodeRegex({ Script: ["Hangul"] }).toString();
+
+// http://spec.commonmark.org/0.25/#ascii-punctuation-character
+const asciiPunctuationCharset = /* prettier-ignore */ regexpUtil.charset(
+  "!", '"', "#",  "$", "%", "&", "'", "(", ")", "*",
+  "+", ",", "-",  ".", "/", ":", ";", "<", "=", ">",
+  "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
+  "}", "~"
+);
+
+// http://spec.commonmark.org/0.25/#punctuation-character
+const punctuationCharset = unicodeRegex({
+  // http://unicode.org/Public/5.1.0/ucd/UCD.html#General_Category_Values
+  General_Category: [
+    /* Pc */ "Connector_Punctuation",
+    /* Pd */ "Dash_Punctuation",
+    /* Pe */ "Close_Punctuation",
+    /* Pf */ "Final_Punctuation",
+    /* Pi */ "Initial_Punctuation",
+    /* Po */ "Other_Punctuation",
+    /* Ps */ "Open_Punctuation"
+  ]
+}).union(asciiPunctuationCharset);
+
+const punctuationPattern = punctuationCharset.toString();
+
+module.exports = {
+  cjkPattern,
+  kPattern,
+  punctuationPattern
+};
--- a/src/language-markdown/parser-markdown.js
+++ b/src/language-markdown/parser-markdown.js
@ -4,8 +4,7 @@ const remarkParse = require("remark-parse");
 const unified = require("unified");
 const pragma = require("./pragma");
 const parseFrontMatter = require("../utils/front-matter");
-const util = require("../common/util");
-const { getOrderedListItemInfo } = require("./utils");
+const { getOrderedListItemInfo, splitText } = require("./utils");
 const mdx = require("./mdx");

 // 0x0 ~ 0x10ffff
@ -46,7 +45,7 @@ function createParse({ isMDX }) {
      .use(transformInlineCode)
      .use(transformIndentedCodeblockAndMarkItsParentList(text))
      .use(markAlignedList(text, opts))
-      .use(splitText(opts))
+      .use(splitTextIntoSentences(opts))
      .use(isMDX ? htmlToJsx : identity)
      .use(isMDX ? mergeContinuousImportExport : identity);
    return processor.runSync(processor.parse(text));
@ -170,7 +169,7 @@ function mergeContinuousTexts() {
  );
 }

-function splitText(options) {
+function splitTextIntoSentences(options) {
  return () => ast =>
    map(ast, (node, index, [parentNode]) => {
      if (node.type !== "text") {
@ -191,7 +190,7 @@ function splitText(options) {
      return {
        type: "sentence",
        position: node.position,
-        children: util.splitText(value, options)
+        children: splitText(value, options)
      };
    });
 }
--- a/src/language-markdown/printer-markdown.js
+++ b/src/language-markdown/printer-markdown.js
@ -20,7 +20,12 @@ const {
  utils: { mapDoc },
  printer: { printDocToString }
 } = require("../doc");
-const { getOrderedListItemInfo, getFencedCodeBlockValue } = require("./utils");
+const {
+  getFencedCodeBlockValue,
+  getOrderedListItemInfo,
+  splitText,
+  punctuationPattern
+} = require("./utils");

 const TRAILING_HARDLINE_NODES = ["importExport"];

@ -57,22 +62,20 @@ function genericPrint(path, options, print) {

  if (shouldRemainTheSameContent(path)) {
    return concat(
-      privateUtil
-        .splitText(
-          options.originalText.slice(
-            node.position.start.offset,
-            node.position.end.offset
-          ),
-          options
-        )
-        .map(
-          node =>
-            node.type === "word"
-              ? node.value
-              : node.value === ""
-                ? ""
-                : printLine(path, node.value, options)
-        )
+      splitText(
+        options.originalText.slice(
+          node.position.start.offset,
+          node.position.end.offset
+        ),
+        options
+      ).map(
+        node =>
+          node.type === "word"
+            ? node.value
+            : node.value === ""
+              ? ""
+              : printLine(path, node.value, options)
+      )
    );
  }

@ -99,8 +102,8 @@ function genericPrint(path, options, print) {
        .replace(
          new RegExp(
            [
-              `(^|[${privateUtil.punctuationCharRange}])(_+)`,
-              `(_+)([${privateUtil.punctuationCharRange}]|$)`
+              `(^|${punctuationPattern})(_+)`,
+              `(_+)(${punctuationPattern}|$)`
            ].join("|"),
            "g"
          ),
--- a/src/language-markdown/utils.js
+++ b/src/language-markdown/utils.js
@ -1,5 +1,131 @@
 "use strict";

+const {
+  cjkPattern,
+  kPattern,
+  punctuationPattern
+} = require("./constants.evaluate");
+const { getLast } = require("../common/util");
+
+const kRegex = new RegExp(kPattern);
+const punctuationRegex = new RegExp(punctuationPattern);
+
+/**
+ * split text into whitespaces and words
+ * @param {string} text
+ * @return {Array<{ type: "whitespace", value: " " | "\n" | "" } | { type: "word", value: string }>}
+ */
+function splitText(text, options) {
+  const KIND_NON_CJK = "non-cjk";
+  const KIND_CJ_LETTER = "cj-letter";
+  const KIND_K_LETTER = "k-letter";
+  const KIND_CJK_PUNCTUATION = "cjk-punctuation";
+
+  const nodes = [];
+
+  (options.proseWrap === "preserve"
+    ? text
+    : text.replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2")
+  )
+    .split(/([ \t\n]+)/)
+    .forEach((token, index, tokens) => {
+      // whitespace
+      if (index % 2 === 1) {
+        nodes.push({
+          type: "whitespace",
+          value: /\n/.test(token) ? "\n" : " "
+        });
+        return;
+      }
+
+      // word separated by whitespace
+
+      if ((index === 0 || index === tokens.length - 1) && token === "") {
+        return;
+      }
+
+      token
+        .split(new RegExp(`(${cjkPattern})`))
+        .forEach((innerToken, innerIndex, innerTokens) => {
+          if (
+            (innerIndex === 0 || innerIndex === innerTokens.length - 1) &&
+            innerToken === ""
+          ) {
+            return;
+          }
+
+          // non-CJK word
+          if (innerIndex % 2 === 0) {
+            if (innerToken !== "") {
+              appendNode({
+                type: "word",
+                value: innerToken,
+                kind: KIND_NON_CJK,
+                hasLeadingPunctuation: punctuationRegex.test(innerToken[0]),
+                hasTrailingPunctuation: punctuationRegex.test(
+                  getLast(innerToken)
+                )
+              });
+            }
+            return;
+          }
+
+          // CJK character
+          appendNode(
+            punctuationRegex.test(innerToken)
+              ? {
+                  type: "word",
+                  value: innerToken,
+                  kind: KIND_CJK_PUNCTUATION,
+                  hasLeadingPunctuation: true,
+                  hasTrailingPunctuation: true
+                }
+              : {
+                  type: "word",
+                  value: innerToken,
+                  kind: kRegex.test(innerToken)
+                    ? KIND_K_LETTER
+                    : KIND_CJ_LETTER,
+                  hasLeadingPunctuation: false,
+                  hasTrailingPunctuation: false
+                }
+          );
+        });
+    });
+
+  return nodes;
+
+  function appendNode(node) {
+    const lastNode = getLast(nodes);
+    if (lastNode && lastNode.type === "word") {
+      if (
+        (lastNode.kind === KIND_NON_CJK &&
+          node.kind === KIND_CJ_LETTER &&
+          !lastNode.hasTrailingPunctuation) ||
+        (lastNode.kind === KIND_CJ_LETTER &&
+          node.kind === KIND_NON_CJK &&
+          !node.hasLeadingPunctuation)
+      ) {
+        nodes.push({ type: "whitespace", value: " " });
+      } else if (
+        !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) &&
+        // disallow leading/trailing full-width whitespace
+        ![lastNode.value, node.value].some(value => /\u3000/.test(value))
+      ) {
+        nodes.push({ type: "whitespace", value: "" });
+      }
+    }
+    nodes.push(node);
+
+    function isBetween(kind1, kind2) {
+      return (
+        (lastNode.kind === kind1 && node.kind === kind2) ||
+        (lastNode.kind === kind2 && node.kind === kind1)
+      );
+    }
+  }
+}
+
 function getOrderedListItemInfo(orderListItem, originalText) {
  const [, numberText, marker, leadingSpaces] = originalText
    .slice(
@ -48,6 +174,8 @@ function getFencedCodeBlockValue(node, originalText) {
 }

 module.exports = {
+  splitText,
+  punctuationPattern,
  getFencedCodeBlockValue,
  getOrderedListItemInfo
 };
--- a/tests/markdown_splitCjkText/snapshots/jsfmt.spec.js.snap
+++ b/tests/markdown_splitCjkText/snapshots/jsfmt.spec.js.snap
@ -1,6 +1,6 @@
 // Jest Snapshot v1, https://goo.gl/fbAQLP

-exports[`cjk.md - markdown-verify 1`] = `
+exports[`chinese-japanese.md - markdown-verify 1`] = `
 這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落

 全　　形　空白全　　形　空白全　　形　空白全　　形　空白全　　形　空白全　　形　空白全　　形　空白
@ -18,6 +18,13 @@ exports[`cjk.md - markdown-verify 1`] = `

 `;

+exports[`korean.md - markdown-verify 1`] = `
+예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문
+
+`;
+
 exports[`link.md - markdown-verify 1`] = `
 [這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落][]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/tests/markdown_splitCjkText/chinese-japanese.md
+++ b/tests/markdown_splitCjkText/chinese-japanese.md
--- a/tests/markdown_splitCjkText/korean.md
+++ b/tests/markdown_splitCjkText/korean.md
@ -0,0 +1 @@
+예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문
--- a/yarn.lock
+++ b/yarn.lock
@ -1344,9 +1344,12 @@ circular-json@^0.3.1:
  version "0.3.1"
  resolved "https://registry.yarnpkg.com/circular-json/-/circular-json-0.3.1.tgz#be8b36aefccde8b3ca7aa2d6afc07a37242c0d2d"

-cjk-regex@1.0.2:
-  version "1.0.2"
-  resolved "https://registry.yarnpkg.com/cjk-regex/-/cjk-regex-1.0.2.tgz#86f5170ecfaef9049ec91f8068e15d63d8e10154"
+cjk-regex@2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/cjk-regex/-/cjk-regex-2.0.0.tgz#060aa111e61092768c438ccc9c643a53e8fe1ee5"
+  dependencies:
+    regexp-util "^1.2.1"
+    unicode-regex "^2.0.0"

 class-utils@^0.3.5:
  version "0.3.6"
@ -4815,6 +4818,12 @@ regex-not@^1.0.0, regex-not@^1.0.2:
    extend-shallow "^3.0.2"
    safe-regex "^1.1.0"

+regexp-util@1.2.2, regexp-util@^1.2.0, regexp-util@^1.2.1:
+  version "1.2.2"
+  resolved "https://registry.yarnpkg.com/regexp-util/-/regexp-util-1.2.2.tgz#5cf599134921eb0d776e41d41e9c0da33f0fa2fc"
+  dependencies:
+    tslib "^1.9.0"
+
 regexpu-core@^4.1.3:
  version "4.1.5"
  resolved "https://registry.yarnpkg.com/regexpu-core/-/regexpu-core-4.1.5.tgz#57fdfe1148f8a7a069086228515130cf1820ddd0"
@ -5669,7 +5678,7 @@ tslib@^1.8.0:
  version "1.9.1"
  resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.9.1.tgz#a5d1f0532a49221c87755cfcc89ca37197242ba7"

-tslib@^1.9.1, tslib@^1.9.3:
+tslib@^1.9.0, tslib@^1.9.1, tslib@^1.9.3:
  version "1.9.3"
  resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.9.3.tgz#d7e4dd79245d85428c4d7e4822a79917954ca286"

@ -5789,9 +5798,11 @@ unicode-property-aliases-ecmascript@^1.0.4:
  version "1.0.4"
  resolved "https://registry.yarnpkg.com/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-1.0.4.tgz#5a533f31b4317ea76f17d807fa0d116546111dd0"

-unicode-regex@1.0.1:
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-1.0.1.tgz#f819e050191d5b9561a339a58dd3b9095ed94b35"
+unicode-regex@2.0.0, unicode-regex@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-2.0.0.tgz#ef8f6642c37dddcaa0c09af5b9456aabf6b436a3"
+  dependencies:
+    regexp-util "^1.2.0"

 unified@6.1.6:
  version "6.1.6"
				`@ -0,0 +1 @@`
				`예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문`