fix(markdown): do not add whitespaces between Latin and Hangul (#5040)

Based on https://github.com/prettier/prettier/issues/5028#issuecomment-417825085, it seems Korean text uses conventional spaces, so we're not going to add whitespace between Latin and Hangul.
master
Ika 2018-09-02 16:20:22 +08:00 committed by GitHub
parent e86f08555a
commit 91206891cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 252 additions and 172 deletions

View File

@ -20,7 +20,7 @@
"@iarna/toml": "2.0.0",
"camelcase": "4.1.0",
"chalk": "2.1.0",
"cjk-regex": "1.0.2",
"cjk-regex": "2.0.0",
"cosmiconfig": "5.0.6",
"dashify": "0.2.2",
"dedent": "0.7.0",
@ -53,13 +53,14 @@
"postcss-scss": "1.0.6",
"postcss-selector-parser": "2.2.3",
"postcss-values-parser": "1.5.0",
"regexp-util": "1.2.2",
"remark-parse": "5.0.0",
"resolve": "1.5.0",
"semver": "5.4.1",
"string-width": "2.1.1",
"typescript": "3.0.1",
"typescript-eslint-parser": "18.0.0",
"unicode-regex": "1.0.1",
"unicode-regex": "2.0.0",
"unified": "6.1.6",
"vnopts": "1.0.2",
"yaml": "1.0.0-rc.8",

View File

@ -14,6 +14,7 @@ const uglify = require("rollup-plugin-uglify");
const babel = require("rollup-plugin-babel");
const nativeShims = require("./rollup-plugins/native-shims");
const executable = require("./rollup-plugins/executable");
const evaluate = require("./rollup-plugins/evaluate");
const EXTERNALS = [
"assert",
@ -108,6 +109,7 @@ function getRollupConfig(bundle) {
config.plugins = [
replace(replaceStrings),
executable(),
evaluate(),
json(),
bundle.alias && alias(bundle.alias),
bundle.target === "universal" &&

View File

@ -0,0 +1,28 @@
"use strict";
module.exports = function() {
return {
name: "evaluate",
transform(_text, id) {
if (!/\.evaluate\.js$/.test(id)) {
return null;
}
const json = JSON.stringify(
require(id.replace(/^\0commonjs-proxy:/, "")),
(_, v) => {
if (typeof v === "function") {
throw new Error(`Cannot evaluate functions.`);
}
return v;
}
);
return {
code: `const json = ${json}; export default json;`,
map: { mappings: "" }
};
}
};
};

View File

@ -3,32 +3,10 @@
const stringWidth = require("string-width");
const emojiRegex = require("emoji-regex")();
const escapeStringRegexp = require("escape-string-regexp");
const getCjkRegex = require("cjk-regex");
const getUnicodeRegex = require("unicode-regex");
// eslint-disable-next-line no-control-regex
const notAsciiRegex = /[^\x20-\x7F]/;
const cjkPattern = getCjkRegex().source;
// http://spec.commonmark.org/0.25/#ascii-punctuation-character
const asciiPunctuationCharRange = escapeStringRegexp(
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
);
// http://spec.commonmark.org/0.25/#punctuation-character
const punctuationCharRange = `${asciiPunctuationCharRange}${getUnicodeRegex([
"Pc",
"Pd",
"Pe",
"Pf",
"Pi",
"Po",
"Ps"
]).source.slice(1, -1)}`; // remove bracket expression `[` and `]`
const punctuationRegex = new RegExp(`[${punctuationCharRange}]`);
function isExportDeclaration(node) {
if (node) {
switch (node.type) {
@ -598,119 +576,6 @@ function getMaxContinuousCount(str, target) {
);
}
/**
 * Split text into a flat list of word and whitespace nodes.
 *
 * Runs of spaces/tabs/newlines become one whitespace node (`"\n"` if the run
 * contains a newline, `" "` otherwise). Every character matching `cjkPattern`
 * becomes its own word node, and a whitespace node (`" "` or `""`) may be
 * inserted between two adjacent word nodes depending on their kinds.
 *
 * @param {string} text
 * @param {{ proseWrap: string }} options unless `proseWrap` is `"preserve"`,
 *   a single newline directly between two CJK characters is removed first
 * @return {Array<
 *   { type: "whitespace", value: " " | "\n" | "" } |
 *   { type: "word", value: string, kind: string,
 *     hasLeadingPunctuation: boolean, hasTrailingPunctuation: boolean }
 * >}
 */
function splitText(text, options) {
  const KIND_NON_CJK = "non-cjk";
  const KIND_CJK_CHARACTER = "cjk-character";
  const KIND_CJK_PUNCTUATION = "cjk-punctuation";

  const collected = [];

  let source = text;
  if (options.proseWrap !== "preserve") {
    const wrappedCjkPair = new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g");
    source = text.replace(wrappedCjkPair, "$1$2");
  }

  // The capture group keeps each matched CJK character in the split result,
  // at the odd indexes.
  const cjkSplitter = new RegExp(`(${cjkPattern})`);

  // Same trick for whitespace: odd indexes are the whitespace runs.
  const chunks = source.split(/([ \t\n]+)/);

  for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) {
    const chunk = chunks[chunkIndex];

    if (chunkIndex % 2 === 1) {
      collected.push({
        type: "whitespace",
        value: /\n/.test(chunk) ? "\n" : " "
      });
      continue;
    }

    // Leading/trailing whitespace produces an empty first/last chunk.
    const isOuterEdge = chunkIndex === 0 || chunkIndex === chunks.length - 1;
    if (isOuterEdge && chunk === "") {
      continue;
    }

    const pieces = chunk.split(cjkSplitter);
    for (let pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
      const piece = pieces[pieceIndex];

      const isInnerEdge = pieceIndex === 0 || pieceIndex === pieces.length - 1;
      if (isInnerEdge && piece === "") {
        continue;
      }

      if (pieceIndex % 2 === 0) {
        // Text between CJK characters; empty when two of them are adjacent.
        if (piece !== "") {
          pushWord(toNonCjkWord(piece));
        }
        continue;
      }

      pushWord(toCjkWord(piece));
    }
  }

  return collected;

  // Build the node for a run of non-CJK characters.
  function toNonCjkWord(value) {
    return {
      type: "word",
      value,
      kind: KIND_NON_CJK,
      hasLeadingPunctuation: punctuationRegex.test(value[0]),
      hasTrailingPunctuation: punctuationRegex.test(getLast(value))
    };
  }

  // Build the node for a single character matched by cjkPattern.
  function toCjkWord(value) {
    if (punctuationRegex.test(value)) {
      return {
        type: "word",
        value,
        kind: KIND_CJK_PUNCTUATION,
        hasLeadingPunctuation: true,
        hasTrailingPunctuation: true
      };
    }
    return {
      type: "word",
      value,
      kind: KIND_CJK_CHARACTER,
      hasLeadingPunctuation: false,
      hasTrailingPunctuation: false
    };
  }

  // Append a word, first inserting a separator when it directly follows
  // another word (i.e. no whitespace node sits in between).
  function pushWord(word) {
    const previous = getLast(collected);
    if (previous && previous.type === "word") {
      const separator = chooseSeparator(previous, word);
      if (separator !== null) {
        collected.push({ type: "whitespace", value: separator });
      }
    }
    collected.push(word);
  }

  // Returns " ", "" or null (no whitespace node at all) for two adjacent words.
  function chooseSeparator(left, right) {
    const nonCjkBeforeCjk =
      left.kind === KIND_NON_CJK &&
      right.kind === KIND_CJK_CHARACTER &&
      !left.hasTrailingPunctuation;
    const cjkBeforeNonCjk =
      left.kind === KIND_CJK_CHARACTER &&
      right.kind === KIND_NON_CJK &&
      !right.hasLeadingPunctuation;
    if (nonCjkBeforeCjk || cjkBeforeNonCjk) {
      return " ";
    }

    const pairsNonCjkWithCjkPunctuation =
      (left.kind === KIND_NON_CJK && right.kind === KIND_CJK_PUNCTUATION) ||
      (left.kind === KIND_CJK_PUNCTUATION && right.kind === KIND_NON_CJK);
    if (pairsNonCjkWithCjkPunctuation) {
      return null;
    }

    // disallow leading/trailing full-width whitespace
    const touchesFullWidthSpace =
      /\u3000/.test(left.value) || /\u3000/.test(right.value);
    return touchesFullWidthSpace ? null : "";
  }
}
function getStringWidth(text) {
if (!text) {
return 0;
@ -803,10 +668,7 @@ function isWithinParentArrayProperty(path, propertyName) {
}
module.exports = {
punctuationRegex,
punctuationCharRange,
getStringWidth,
splitText,
getMaxContinuousCount,
getPrecedence,
shouldFlatten,

View File

@ -0,0 +1,38 @@
"use strict";
const cjkRegex = require("cjk-regex");
const regexpUtil = require("regexp-util");
const unicodeRegex = require("unicode-regex");
const cjkPattern = cjkRegex().toString();
const kPattern = unicodeRegex({ Script: ["Hangul"] }).toString();
// http://spec.commonmark.org/0.25/#ascii-punctuation-character
const asciiPunctuationCharset = /* prettier-ignore */ regexpUtil.charset(
"!", '"', "#", "$", "%", "&", "'", "(", ")", "*",
"+", ",", "-", ".", "/", ":", ";", "<", "=", ">",
"?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
"}", "~"
);
// http://spec.commonmark.org/0.25/#punctuation-character
const punctuationCharset = unicodeRegex({
// http://unicode.org/Public/5.1.0/ucd/UCD.html#General_Category_Values
General_Category: [
/* Pc */ "Connector_Punctuation",
/* Pd */ "Dash_Punctuation",
/* Pe */ "Close_Punctuation",
/* Pf */ "Final_Punctuation",
/* Pi */ "Initial_Punctuation",
/* Po */ "Other_Punctuation",
/* Ps */ "Open_Punctuation"
]
}).union(asciiPunctuationCharset);
const punctuationPattern = punctuationCharset.toString();
module.exports = {
cjkPattern,
kPattern,
punctuationPattern
};

View File

@ -4,8 +4,7 @@ const remarkParse = require("remark-parse");
const unified = require("unified");
const pragma = require("./pragma");
const parseFrontMatter = require("../utils/front-matter");
const util = require("../common/util");
const { getOrderedListItemInfo } = require("./utils");
const { getOrderedListItemInfo, splitText } = require("./utils");
const mdx = require("./mdx");
// 0x0 ~ 0x10ffff
@ -46,7 +45,7 @@ function createParse({ isMDX }) {
.use(transformInlineCode)
.use(transformIndentedCodeblockAndMarkItsParentList(text))
.use(markAlignedList(text, opts))
.use(splitText(opts))
.use(splitTextIntoSentences(opts))
.use(isMDX ? htmlToJsx : identity)
.use(isMDX ? mergeContinuousImportExport : identity);
return processor.runSync(processor.parse(text));
@ -170,7 +169,7 @@ function mergeContinuousTexts() {
);
}
function splitText(options) {
function splitTextIntoSentences(options) {
return () => ast =>
map(ast, (node, index, [parentNode]) => {
if (node.type !== "text") {
@ -191,7 +190,7 @@ function splitText(options) {
return {
type: "sentence",
position: node.position,
children: util.splitText(value, options)
children: splitText(value, options)
};
});
}

View File

@ -20,7 +20,12 @@ const {
utils: { mapDoc },
printer: { printDocToString }
} = require("../doc");
const { getOrderedListItemInfo, getFencedCodeBlockValue } = require("./utils");
const {
getFencedCodeBlockValue,
getOrderedListItemInfo,
splitText,
punctuationPattern
} = require("./utils");
const TRAILING_HARDLINE_NODES = ["importExport"];
@ -57,22 +62,20 @@ function genericPrint(path, options, print) {
if (shouldRemainTheSameContent(path)) {
return concat(
privateUtil
.splitText(
options.originalText.slice(
node.position.start.offset,
node.position.end.offset
),
options
)
.map(
node =>
node.type === "word"
? node.value
: node.value === ""
? ""
: printLine(path, node.value, options)
)
splitText(
options.originalText.slice(
node.position.start.offset,
node.position.end.offset
),
options
).map(
node =>
node.type === "word"
? node.value
: node.value === ""
? ""
: printLine(path, node.value, options)
)
);
}
@ -99,8 +102,8 @@ function genericPrint(path, options, print) {
.replace(
new RegExp(
[
`(^|[${privateUtil.punctuationCharRange}])(_+)`,
`(_+)([${privateUtil.punctuationCharRange}]|$)`
`(^|${punctuationPattern})(_+)`,
`(_+)(${punctuationPattern}|$)`
].join("|"),
"g"
),

View File

@ -1,5 +1,131 @@
"use strict";
const {
cjkPattern,
kPattern,
punctuationPattern
} = require("./constants.evaluate");
const { getLast } = require("../common/util");
// Both regexes are built from the imported pattern strings without any flags
// (no `g`/`y`), so `.test()` does not advance `lastIndex` between calls and
// the instances can be reused for every character that splitText() checks.
// kRegex: matches the Hangul pattern (`kPattern`).
const kRegex = new RegExp(kPattern);
// punctuationRegex: matches the punctuation pattern (`punctuationPattern`).
const punctuationRegex = new RegExp(punctuationPattern);
/**
 * Split text into a flat list of word and whitespace nodes.
 *
 * Runs of spaces/tabs/newlines become one whitespace node (`"\n"` if the run
 * contains a newline, `" "` otherwise). Every character matching `cjkPattern`
 * becomes its own word node. Between two directly adjacent word nodes a
 * whitespace node may be inserted: `" "` between a non-CJK word and a
 * Chinese/Japanese letter (unless punctuation sits at the boundary), `""`
 * for most other pairs (including anything next to a Hangul letter).
 *
 * @param {string} text
 * @param {{ proseWrap: string }} options unless `proseWrap` is `"preserve"`,
 *   a single newline directly between two CJK characters is removed first
 * @return {Array<
 *   { type: "whitespace", value: " " | "\n" | "" } |
 *   { type: "word", value: string, kind: string,
 *     hasLeadingPunctuation: boolean, hasTrailingPunctuation: boolean }
 * >}
 */
function splitText(text, options) {
  const KIND_NON_CJK = "non-cjk";
  const KIND_CJ_LETTER = "cj-letter";
  const KIND_K_LETTER = "k-letter";
  const KIND_CJK_PUNCTUATION = "cjk-punctuation";

  const collected = [];

  let source = text;
  if (options.proseWrap !== "preserve") {
    const wrappedCjkPair = new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g");
    source = text.replace(wrappedCjkPair, "$1$2");
  }

  // The capture group keeps each matched CJK character in the split result,
  // at the odd indexes.
  const cjkSplitter = new RegExp(`(${cjkPattern})`);

  // Same trick for whitespace: odd indexes are the whitespace runs.
  const chunks = source.split(/([ \t\n]+)/);

  for (let chunkIndex = 0; chunkIndex < chunks.length; chunkIndex++) {
    const chunk = chunks[chunkIndex];

    if (chunkIndex % 2 === 1) {
      collected.push({
        type: "whitespace",
        value: /\n/.test(chunk) ? "\n" : " "
      });
      continue;
    }

    // Leading/trailing whitespace produces an empty first/last chunk.
    const isOuterEdge = chunkIndex === 0 || chunkIndex === chunks.length - 1;
    if (isOuterEdge && chunk === "") {
      continue;
    }

    const pieces = chunk.split(cjkSplitter);
    for (let pieceIndex = 0; pieceIndex < pieces.length; pieceIndex++) {
      const piece = pieces[pieceIndex];

      const isInnerEdge = pieceIndex === 0 || pieceIndex === pieces.length - 1;
      if (isInnerEdge && piece === "") {
        continue;
      }

      if (pieceIndex % 2 === 0) {
        // Text between CJK characters; empty when two of them are adjacent.
        if (piece !== "") {
          pushWord(toNonCjkWord(piece));
        }
        continue;
      }

      pushWord(toCjkWord(piece));
    }
  }

  return collected;

  // Build the node for a run of non-CJK characters.
  function toNonCjkWord(value) {
    return {
      type: "word",
      value,
      kind: KIND_NON_CJK,
      hasLeadingPunctuation: punctuationRegex.test(value[0]),
      hasTrailingPunctuation: punctuationRegex.test(getLast(value))
    };
  }

  // Build the node for a single character matched by cjkPattern:
  // punctuation, Hangul letter, or (otherwise) Chinese/Japanese letter.
  function toCjkWord(value) {
    if (punctuationRegex.test(value)) {
      return {
        type: "word",
        value,
        kind: KIND_CJK_PUNCTUATION,
        hasLeadingPunctuation: true,
        hasTrailingPunctuation: true
      };
    }
    return {
      type: "word",
      value,
      kind: kRegex.test(value) ? KIND_K_LETTER : KIND_CJ_LETTER,
      hasLeadingPunctuation: false,
      hasTrailingPunctuation: false
    };
  }

  // Append a word, first inserting a separator when it directly follows
  // another word (i.e. no whitespace node sits in between).
  function pushWord(word) {
    const previous = getLast(collected);
    if (previous && previous.type === "word") {
      const separator = chooseSeparator(previous, word);
      if (separator !== null) {
        collected.push({ type: "whitespace", value: separator });
      }
    }
    collected.push(word);
  }

  // Returns " ", "" or null (no whitespace node at all) for two adjacent words.
  function chooseSeparator(left, right) {
    // Only Chinese/Japanese letters get a real space next to non-CJK text;
    // Hangul letters (KIND_K_LETTER) fall through to the "" case below.
    const nonCjkBeforeCj =
      left.kind === KIND_NON_CJK &&
      right.kind === KIND_CJ_LETTER &&
      !left.hasTrailingPunctuation;
    const cjBeforeNonCjk =
      left.kind === KIND_CJ_LETTER &&
      right.kind === KIND_NON_CJK &&
      !right.hasLeadingPunctuation;
    if (nonCjkBeforeCj || cjBeforeNonCjk) {
      return " ";
    }

    const pairsNonCjkWithCjkPunctuation =
      (left.kind === KIND_NON_CJK && right.kind === KIND_CJK_PUNCTUATION) ||
      (left.kind === KIND_CJK_PUNCTUATION && right.kind === KIND_NON_CJK);
    if (pairsNonCjkWithCjkPunctuation) {
      return null;
    }

    // disallow leading/trailing full-width whitespace
    const touchesFullWidthSpace =
      /\u3000/.test(left.value) || /\u3000/.test(right.value);
    return touchesFullWidthSpace ? null : "";
  }
}
function getOrderedListItemInfo(orderListItem, originalText) {
const [, numberText, marker, leadingSpaces] = originalText
.slice(
@ -48,6 +174,8 @@ function getFencedCodeBlockValue(node, originalText) {
}
module.exports = {
splitText,
punctuationPattern,
getFencedCodeBlockValue,
getOrderedListItemInfo
};

View File

@ -1,6 +1,6 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`cjk.md - markdown-verify 1`] = `
exports[`chinese-japanese.md - markdown-verify 1`] = `
這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落
全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白全  形 空白
@ -18,6 +18,13 @@ exports[`cjk.md - markdown-verify 1`] = `
`;
exports[`korean.md - markdown-verify 1`] = `
예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문
`;
exports[`link.md - markdown-verify 1`] = `
[這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落][]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -0,0 +1 @@
예문Latin예문Latin 예문Latin예문 Latin예문Latin 예문 Latin 예문

View File

@ -1344,9 +1344,12 @@ circular-json@^0.3.1:
version "0.3.1"
resolved "https://registry.yarnpkg.com/circular-json/-/circular-json-0.3.1.tgz#be8b36aefccde8b3ca7aa2d6afc07a37242c0d2d"
cjk-regex@1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/cjk-regex/-/cjk-regex-1.0.2.tgz#86f5170ecfaef9049ec91f8068e15d63d8e10154"
cjk-regex@2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/cjk-regex/-/cjk-regex-2.0.0.tgz#060aa111e61092768c438ccc9c643a53e8fe1ee5"
dependencies:
regexp-util "^1.2.1"
unicode-regex "^2.0.0"
class-utils@^0.3.5:
version "0.3.6"
@ -4815,6 +4818,12 @@ regex-not@^1.0.0, regex-not@^1.0.2:
extend-shallow "^3.0.2"
safe-regex "^1.1.0"
regexp-util@1.2.2, regexp-util@^1.2.0, regexp-util@^1.2.1:
version "1.2.2"
resolved "https://registry.yarnpkg.com/regexp-util/-/regexp-util-1.2.2.tgz#5cf599134921eb0d776e41d41e9c0da33f0fa2fc"
dependencies:
tslib "^1.9.0"
regexpu-core@^4.1.3:
version "4.1.5"
resolved "https://registry.yarnpkg.com/regexpu-core/-/regexpu-core-4.1.5.tgz#57fdfe1148f8a7a069086228515130cf1820ddd0"
@ -5669,7 +5678,7 @@ tslib@^1.8.0:
version "1.9.1"
resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.9.1.tgz#a5d1f0532a49221c87755cfcc89ca37197242ba7"
tslib@^1.9.1, tslib@^1.9.3:
tslib@^1.9.0, tslib@^1.9.1, tslib@^1.9.3:
version "1.9.3"
resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.9.3.tgz#d7e4dd79245d85428c4d7e4822a79917954ca286"
@ -5789,9 +5798,11 @@ unicode-property-aliases-ecmascript@^1.0.4:
version "1.0.4"
resolved "https://registry.yarnpkg.com/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-1.0.4.tgz#5a533f31b4317ea76f17d807fa0d116546111dd0"
unicode-regex@1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-1.0.1.tgz#f819e050191d5b9561a339a58dd3b9095ed94b35"
unicode-regex@2.0.0, unicode-regex@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-2.0.0.tgz#ef8f6642c37dddcaa0c09af5b9456aabf6b436a3"
dependencies:
regexp-util "^1.2.0"
unified@6.1.6:
version "6.1.6"