fix(markdown): handle punctuation variants (#3254)

* fix(markdown): handle punctuation variants

* docs: add comment
master
Ika 2017-11-13 00:09:04 +08:00 committed by GitHub
parent ece764a049
commit d08df0b221
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 42 additions and 25 deletions

View File

@ -49,6 +49,7 @@
"strip-bom": "3.0.0",
"typescript": "2.5.3",
"typescript-eslint-parser": "git://github.com/eslint/typescript-eslint-parser.git#9c71a627da36e97da52ed2731d58509c952b67ae",
"unicode-regex": "1.0.1",
"unified": "6.1.5"
},
"devDependencies": {

View File

@ -11,13 +11,7 @@ const fill = docBuilders.fill;
const align = docBuilders.align;
const docPrinter = require("./doc-printer");
const printDocToString = docPrinter.printDocToString;
const asciiPunctuationPattern = util.asciiPunctuationPattern;
const getCjkRegex = require("cjk-regex");
const punctuationPattern = `(?:${[
getCjkRegex.punctuations().source,
`[${asciiPunctuationPattern}]`
].join("|")})`;
const punctuationCharRange = util.punctuationCharRange;
const SINGLE_LINE_NODE_TYPES = [
"heading",
@ -87,7 +81,10 @@ function genericPrint(path, options, print) {
.replace(/[*]/g, "\\*") // escape all `*`
.replace(
new RegExp(
`(^|${punctuationPattern})(_+)|(_+)(${punctuationPattern}|$)`,
[
`(^|[${punctuationCharRange}])(_+)`,
`(_+)([${punctuationCharRange}]|$)`
].join("|"),
"g"
),
(_, text1, underscore1, underscore2, text2) =>
@ -118,14 +115,14 @@ function genericPrint(path, options, print) {
prevNode.type === "sentence" &&
prevNode.children.length > 0 &&
prevNode.children[prevNode.children.length - 1].type === "word" &&
new RegExp(`[^${asciiPunctuationPattern}]$`).test(
new RegExp(`[^${punctuationCharRange}]$`).test(
prevNode.children[prevNode.children.length - 1].value
)) ||
(nextNode &&
nextNode.type === "sentence" &&
nextNode.children.length > 0 &&
nextNode.children[0].type === "word" &&
new RegExp(`^[^${asciiPunctuationPattern}]`).test(
new RegExp(`^[^${punctuationCharRange}]`).test(
nextNode.children[0].value
));
const style =

View File

@ -3,19 +3,27 @@
const stringWidth = require("string-width");
const emojiRegex = require("emoji-regex")();
const escapeStringRegexp = require("escape-string-regexp");
const getCjkRegex = require("cjk-regex");
const cjkRegex = getCjkRegex();
const getUnicodeRegex = require("unicode-regex");
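// a RegExp with the `g` flag is stateful in RegExp#test() (it advances `lastIndex`),
// so rebuild the pattern without flags before testing with it
// https://stackoverflow.com/a/21373261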
const cjkPunctuationRegex = new RegExp(getCjkRegex.punctuations().source, "");
const cjkPattern = getCjkRegex().source;
// http://spec.commonmark.org/0.25/#ascii-punctuation-character
const asciiPunctuationPattern = escapeStringRegexp(
const asciiPunctuationCharRange = escapeStringRegexp(
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
);
// http://spec.commonmark.org/0.25/#punctuation-character
const punctuationCharRange = `${asciiPunctuationCharRange}${getUnicodeRegex([
"Pc",
"Pd",
"Pe",
"Pf",
"Pi",
"Po",
"Ps"
]).source.slice(1, -1)}`; // remove bracket expression `[` and `]`
function isExportDeclaration(node) {
if (node) {
switch (node.type) {
@ -672,10 +680,7 @@ function splitText(text) {
const nodes = [];
text
.replace(
new RegExp(`(${cjkRegex.source})\n(${cjkRegex.source})`, "g"),
"$1$2"
)
.replace(new RegExp(`(${cjkPattern})\n(${cjkPattern})`, "g"), "$1$2")
// `\s` but exclude full-width whitespace (`\u3000`)
.split(/([^\S\u3000]+)/)
.forEach((token, index, tokens) => {
@ -692,7 +697,7 @@ function splitText(text) {
}
token
.split(new RegExp(`(${cjkRegex.source})`))
.split(new RegExp(`(${cjkPattern})`))
.forEach((innerToken, innerIndex, innerTokens) => {
if (
(innerIndex === 0 || innerIndex === innerTokens.length - 1) &&
@ -714,7 +719,7 @@ function splitText(text) {
}
// CJK character
const kind = cjkPunctuationRegex.test(innerToken)
const kind = new RegExp(`[${punctuationCharRange}]`).test(innerToken)
? KIND_CJK_PUNCTUATION
: KIND_CJK_CHARACTER;
appendNode({ type: "word", value: innerToken, kind });
@ -729,10 +734,10 @@ function splitText(text) {
if (
(lastNode.kind === KIND_NON_CJK &&
node.kind === KIND_CJK_CHARACTER &&
!new RegExp(`[${asciiPunctuationPattern}]$`).test(lastNode.value)) ||
!new RegExp(`[${punctuationCharRange}]$`).test(lastNode.value)) ||
(lastNode.kind === KIND_CJK_CHARACTER &&
node.kind === KIND_NON_CJK &&
!new RegExp(`^[${asciiPunctuationPattern}]`).test(node.value))
!new RegExp(`^[${punctuationCharRange}]`).test(node.value))
) {
nodes.push({ type: "whitespace", value: " " });
} else if (
@ -766,7 +771,7 @@ function getStringWidth(text) {
}
module.exports = {
asciiPunctuationPattern,
punctuationCharRange,
getStringWidth,
splitText,
mapDoc,

View File

@ -9,6 +9,8 @@ exports[`cjk.md 1`] = `
This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote “中文“.
扩展运算符spread是三个点\`...\`)。
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長
@ -30,6 +32,8 @@ English 混合著中文的一段 Paragraph
This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote “中文“.
扩展运算符spread是三个点\`...\`)。
`;
@ -43,6 +47,8 @@ exports[`cjk.md 2`] = `
This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote “中文“.
扩展运算符spread是三个点\`...\`)。
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
這是一段很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長很長的段落
@ -53,6 +59,8 @@ This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote “中文“.
扩展运算符spread是三个点\`...\`)。
`;

View File

@ -6,4 +6,6 @@
This ia an english paragraph with a CJK quote "中文".
This ia an english paragraph with a CJK quote “中文“.
扩展运算符spread是三个点`...`)。

View File

@ -4279,6 +4279,10 @@ unherit@^1.0.4:
inherits "^2.0.1"
xtend "^4.0.1"
unicode-regex@1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/unicode-regex/-/unicode-regex-1.0.1.tgz#f819e050191d5b9561a339a58dd3b9095ed94b35"
unified@6.1.5:
version "6.1.5"
resolved "https://registry.yarnpkg.com/unified/-/unified-6.1.5.tgz#716937872621a63135e62ced2f3ac6a063c6fb87"