-
-
Notifications
You must be signed in to change notification settings - Fork 131
/
text-utils.js
141 lines (120 loc) · 4.21 KB
/
text-utils.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// Is the given character code a UTF-16 high (leading) surrogate, i.e. within
// U+D800–U+DBFF inclusive?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isHighSurrogate = charCode => {
  return 0xd800 <= charCode && charCode <= 0xdbff;
};
// Is the given character code a UTF-16 low (trailing) surrogate, i.e. within
// U+DC00–U+DFFF inclusive?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isLowSurrogate = charCode => {
  return 0xdc00 <= charCode && charCode <= 0xdfff;
};
// Is the given character code one of the variation selectors in
// U+FE00–U+FE0F inclusive?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isVariationSelector = charCode => {
  return 0xfe00 <= charCode && charCode <= 0xfe0f;
};
// Inclusive [first, last] character code ranges that count as combining marks.
const COMBINING_MARK_RANGES = [
  [0x0300, 0x036f], // Combining Diacritical Marks
  [0x1ab0, 0x1aff], // Combining Diacritical Marks Extended
  [0x1dc0, 0x1dff], // Combining Diacritical Marks Supplement
  [0x20d0, 0x20ff], // Combining Diacritical Marks for Symbols
  [0xfe20, 0xfe2f] // Combining Half Marks
];

// Does the given character code fall inside one of the combining mark ranges?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isCombiningCharacter = charCode =>
  COMBINING_MARK_RANGES.some(
    ([first, last]) => charCode >= first && charCode <= last
  );
// Do the two character codes form a UTF-16 surrogate pair, i.e. a high
// surrogate immediately followed by a low surrogate?
//
// * `charCodeA` The first character code {Number}.
// * `charCodeB` The second character code {Number}.
//
// Returns a {Boolean}.
const isSurrogatePair = (charCodeA, charCodeB) => {
  if (!isHighSurrogate(charCodeA)) {
    return false;
  }
  return isLowSurrogate(charCodeB);
};
// Do the two character codes form a variation sequence, i.e. a code that is
// not a variation selector followed by one that is?
//
// * `charCodeA` The first character code {Number}.
// * `charCodeB` The second character code {Number}.
//
// Returns a {Boolean}.
const isVariationSequence = (charCodeA, charCodeB) => {
  // A selector cannot itself be the base of a sequence.
  if (isVariationSelector(charCodeA)) {
    return false;
  }
  return isVariationSelector(charCodeB);
};
// Do the two character codes form a combined character, i.e. a code that is
// not a combining mark followed by one that is?
//
// * `charCodeA` The first character code {Number}.
// * `charCodeB` The second character code {Number}.
//
// Returns a {Boolean}.
const isCombinedCharacter = (charCodeA, charCodeB) => {
  // A combining mark cannot itself be the base of a combined character.
  if (isCombiningCharacter(charCodeA)) {
    return false;
  }
  return isCombiningCharacter(charCodeB);
};
// Is the character at the given index the start of a high/low surrogate pair,
// a variation sequence, or a combined character?
//
// * `string` The {String} to check for a surrogate pair, variation sequence,
//   or combined character.
// * `index` The {Number} index to look for a surrogate pair, variation
//   sequence, or combined character. Defaults to `0`.
//
// Returns a {Boolean}. Always `false` when there is no character at `index`.
const isPairedCharacter = (string, index = 0) => {
  const charCodeA = string.charCodeAt(index);
  // `charCodeAt` yields NaN for an out-of-range index. NaN is neither a
  // variation selector nor a combining mark, so without this guard it would
  // pass the "first code is a base character" half of those checks, and e.g.
  // `index === -1` would report a pair whenever the string begins with a
  // variation selector or combining mark.
  if (Number.isNaN(charCodeA)) {
    return false;
  }
  const charCodeB = string.charCodeAt(index + 1);
  return (
    isSurrogatePair(charCodeA, charCodeB) ||
    isVariationSequence(charCodeA, charCodeB) ||
    isCombinedCharacter(charCodeA, charCodeB)
  );
};
// Is the given character code within U+3000–U+30FF inclusive? Besides
// Hiragana and Katakana, this range also covers the CJK Symbols and
// Punctuation block (U+3000–U+303F).
//
// Named in lowerCamelCase like every other predicate in this file; the
// helper is private to the module.
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isJapaneseKanaCharacter = charCode =>
  charCode >= 0x3000 && charCode <= 0x30ff;

// Is the given character code within the CJK Unified Ideographs range
// U+4E00–U+9FFF inclusive?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isCJKUnifiedIdeograph = charCode =>
  charCode >= 0x4e00 && charCode <= 0x9fff;

// Is the given character code a full-width form, i.e. within U+FF01–U+FF5E
// or U+FFE0–U+FFE6 inclusive?
//
// * `charCode` The character code {Number}.
//
// Returns a {Boolean}.
const isFullWidthForm = charCode =>
  (charCode >= 0xff01 && charCode <= 0xff5e) ||
  (charCode >= 0xffe0 && charCode <= 0xffe6);

// Is the given character a double-width character (Japanese kana and CJK
// punctuation, a CJK unified ideograph, or a full-width form)?
//
// Only the first UTF-16 code unit of `character` is inspected.
//
// * `character` The {String} holding the character to test.
//
// Returns a {Boolean}. An empty string yields `false`.
const isDoubleWidthCharacter = character => {
  const charCode = character.charCodeAt(0);
  return (
    isJapaneseKanaCharacter(charCode) ||
    isCJKUnifiedIdeograph(charCode) ||
    isFullWidthForm(charCode)
  );
};
// Is the given character a half-width form, i.e. is its code within
// U+FF65–U+FFDC or U+FFE8–U+FFEE inclusive?
//
// Only the first UTF-16 code unit of `character` is inspected.
//
// * `character` The {String} holding the character to test.
//
// Returns a {Boolean}. An empty string yields `false`.
const isHalfWidthCharacter = character => {
  const code = character.charCodeAt(0);
  if (code >= 0xff65 && code <= 0xffdc) {
    return true;
  }
  return code >= 0xffe8 && code <= 0xffee;
};
// Is the given character a Korean (Hangul) character?
//
// Only the first UTF-16 code unit of `character` is inspected.
//
// * `character` The {String} holding the character to test.
//
// Returns a {Boolean}. An empty string yields `false`.
const isKoreanCharacter = character => {
  const code = character.charCodeAt(0);
  // Hangul Syllables
  if (code >= 0xac00 && code <= 0xd7a3) {
    return true;
  }
  // Hangul Jamo
  if (code >= 0x1100 && code <= 0x11ff) {
    return true;
  }
  // Hangul Compatibility Jamo
  if (code >= 0x3130 && code <= 0x318f) {
    return true;
  }
  // Hangul Jamo Extended-A
  if (code >= 0xa960 && code <= 0xa97f) {
    return true;
  }
  // Hangul Jamo Extended-B
  return code >= 0xd7b0 && code <= 0xd7ff;
};
// Is the given character a CJK character, i.e. double-width, half-width, or
// Korean?
//
// * `character` The {String} holding the character to test.
//
// Returns a {Boolean}.
const isCJKCharacter = character => {
  if (isDoubleWidthCharacter(character)) {
    return true;
  }
  if (isHalfWidthCharacter(character)) {
    return true;
  }
  return isKoreanCharacter(character);
};
// Does `character` begin a word? That is the case when the previous
// character is a space, tab, hyphen, or slash and `character` itself is
// neither a space nor a tab.
//
// * `previousCharacter` The {String} character preceding `character`.
// * `character` The {String} character to test.
//
// Returns a {Boolean}.
const isWordStart = (previousCharacter, character) => {
  switch (previousCharacter) {
    case ' ':
    case '\t':
    case '-':
    case '/':
      return character !== ' ' && character !== '\t';
    default:
      return false;
  }
};
// Is the position before `character` a wrap boundary? It is when `character`
// starts a word or is itself a CJK character.
//
// * `previousCharacter` The {String} character preceding `character`.
// * `character` The {String} character to test.
//
// Returns a {Boolean}.
const isWrapBoundary = (previousCharacter, character) => {
  if (isWordStart(previousCharacter, character)) {
    return true;
  }
  return isCJKCharacter(character);
};
// Does the given string contain at least one surrogate pair, variation
// sequence, or combined character?
//
// * `string` The {String} to check for the presence of paired characters.
//
// Returns a {Boolean}.
const hasPairedCharacter = string => {
  // Probe every index and stop at the first position that starts a pair.
  for (let position = 0; position < string.length; position++) {
    if (isPairedCharacter(string, position)) {
      return true;
    }
  }
  return false;
};
// Public API of this module. Only the predicates listed here are exported;
// the other helpers defined in this file stay private to it.
module.exports = {
isPairedCharacter,
hasPairedCharacter,
isDoubleWidthCharacter,
isHalfWidthCharacter,
isKoreanCharacter,
isWrapBoundary
};