forked from oven-sh/bun
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen-unicode-table.js
194 lines (164 loc) · 6.67 KB
/
gen-unicode-table.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
// Thank you @evanw for this code!!!
const fs = require("fs");
const path = require("path");
// ES5 reference: https://es5.github.io/
//
// A conforming implementation of this International standard shall interpret
// characters in conformance with the Unicode Standard, Version 3.0 or later
// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
// otherwise specified, it is presumed to be the BMP subset, collection 300.
//
// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
// “Other letter (Lo)”, or “Letter number (Nl)”.
// Code-point lists (Unicode 3.0.0) for the letter categories accepted at the
// start of an identifier, in the order they are concatenated below.
const es5IdStartCategories = [
  require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
  // The "letter number" list
  // (@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points) is
  // deliberately not loaded: old versions of Safari had a bug where they did
  // not include that category, so this set intentionally does not match ES5.
  // Those characters have to be escaped so Safari can read them.
  // See https://github.com/evanw/esbuild/issues/1349 for more information.
];
// Merge every category into one array and order it numerically ascending.
const idStartES5 = [].concat(...es5IdStartCategories).sort((lhs, rhs) => lhs - rhs);
// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
// or “Combining spacing mark (Mc)”
// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
// Code-point lists (Unicode 3.0.0) that may appear after the first character
// of an identifier in addition to everything in idStartES5: combining marks
// (Mn, Mc), decimal digits (Nd) and connector punctuation (Pc).
const es5IdContinueExtras = [
  require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
];
// concat() returns a new array, so idStartES5 itself is left untouched by the sort.
const idContinueES5 = idStartES5
  .concat(...es5IdContinueExtras)
  .sort((lhs, rhs) => lhs - rhs);
// ESNext reference: https://tc39.es/ecma262/
//
// A conforming implementation of ECMAScript must interpret source text input
// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
// is presumed to be the Unicode set, collection 10646.
//
// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");

// Set views of the ESNext lists for constant-time membership checks.
const idStartESNextSet = new Set(idStartESNext);
const idContinueESNextSet = new Set(idContinueESNext);

// Keeps the entries of `codePoints` (in their existing order) that are members of `allowed`.
const keepMembersOf = (codePoints, allowed) =>
  codePoints.filter((codePoint) => allowed.has(codePoint));

// De-duplicated combination of two code-point lists, ordered numerically ascending.
const sortedUnionOf = (first, second) =>
  Array.from(new Set(first.concat(second))).sort((lhs, rhs) => lhs - rhs);

// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
const idStartES5AndESNext = keepMembersOf(idStartES5, idStartESNextSet);
const idContinueES5AndESNext = keepMembersOf(idContinueES5, idContinueESNextSet);

// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
const idStartES5OrESNext = sortedUnionOf(idStartES5, idStartESNext);
const idContinueES5OrESNext = sortedUnionOf(idContinueES5, idContinueESNext);
/**
 * Renders a list of code points as the Zig source text of a
 * `RangeTable.init(...)` call.
 *
 * The emitted call has three arguments:
 *   1. the number of leading entries of `codePoints` that are <= 0xff,
 *   2. an `R16Range` list holding the runs made of entries below 0x1000,
 *   3. an `R32Range` list holding the runs made of all remaining entries.
 * Adjacent entries that differ by exactly one are collapsed into a single
 * inclusive `.{0xLO, 0xHI}` pair; a run never crosses the 0x1000 boundary.
 *
 * @param {number[]} codePoints Code points; runs are only detected between
 *   adjacent entries, so the list is expected to be sorted ascending.
 * @returns {string} The generated lines joined with "\n".
 */
function generateRangeTable(codePoints) {
  const total = codePoints.length;
  const formatRange = (lo, hi) => ` .{0x${lo.toString(16)}, 0x${hi.toString(16)}},`;

  // Length of the leading prefix whose entries are all <= 0xff.
  const firstAboveLatin = codePoints.findIndex((codePoint) => !(codePoint <= 0xff));
  const latinOffset = firstAboveLatin === -1 ? total : firstAboveLatin;

  // Shared read position: the second pass resumes where the first one stopped.
  let cursor = 0;

  // Consumes entries from `cursor` for as long as `accepts` holds and returns
  // one formatted line per run of consecutive values.
  const takeRuns = (accepts) => {
    const rows = [];
    while (cursor < total && accepts(codePoints[cursor])) {
      const first = codePoints[cursor];
      let last = first;
      cursor += 1;
      while (
        cursor < total &&
        accepts(codePoints[cursor]) &&
        codePoints[cursor] === last + 1
      ) {
        last = codePoints[cursor];
        cursor += 1;
      }
      rows.push(formatRange(first, last));
    }
    return rows;
  };

  const narrowRows = takeRuns((codePoint) => codePoint < 0x1000);
  const wideRows = takeRuns(() => true);

  return [
    `RangeTable.init(`,
    ` ${latinOffset},`,
    ` &[_]R16Range{`,
    ...narrowRows,
    ` },`,
    `&[_]R32Range{`,
    ...wideRows,
    ` },`,
    `);`,
  ].join("\n");
}
/**
 * Builds the Zig statement `return switch(codepoint) { ... => true, else => false };`
 * whose `true` prong lists every code point in `codePoints`.
 *
 * Adjacent entries that differ by exactly one are collapsed into a single
 * inclusive `0xLO...0xHI` item. Two formatting quirks are kept on purpose so
 * the generated text does not change: a run is never merged across the 0x1000
 * boundary, and items made of entries at or above 0x1000 are emitted with one
 * leading space.
 *
 * @param {number[]} codePoints Code points; runs are only detected between
 *   adjacent entries, so the list is expected to be sorted ascending without
 *   duplicates.
 * @returns {string} The generated lines joined with "\n".
 */
function generateBigSwitchStatement(codePoints) {
  const lines = [`return switch(codepoint) {`];

  // Shared read position: the second pass resumes where the first one stopped.
  let index = 0;

  // Appends one `LO...HI,` item per run of consecutive values, consuming
  // entries for as long as `accepts` holds for the entry under `index`.
  const emitRuns = (prefix, accepts) => {
    while (index < codePoints.length && accepts(codePoints[index])) {
      const start = codePoints[index];
      index++;
      while (
        index < codePoints.length &&
        accepts(codePoints[index]) &&
        codePoints[index] === codePoints[index - 1] + 1
      ) {
        index++;
      }
      const end = codePoints[index - 1];
      lines.push(`${prefix}0x${start.toString(16)}...0x${end.toString(16)},`);
    }
  };

  // Entries below 0x1000 first (no indent), then everything that remains.
  emitRuns("", (codePoint) => codePoint < 0x1000);
  emitRuns(" ", () => true);

  lines.push(` => true,\nelse => false\n};`);
  return lines.join("\n");
}
// Destination of the generated Zig source, resolved relative to this script's directory.
const unicodeZigPath = path.join(__dirname, "..", "src", "js_lexer", "unicode.zig");

// Contents of the generated file, one array entry per emitted chunk. The
// trailing empty entry makes the joined text end with a newline.
const unicodeZigSource = [
  `// This file was automatically generated by ${path.basename(__filename)}. Do not edit.`,
  `const RangeTable = @import("./range_table.zig");`,
  `// ES5 || ESNext`,
  `pub const id_start = ${generateRangeTable(idStartES5OrESNext)}`,
  `// ES5 || ESNext`,
  `pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}`,
  `pub const printable_id_start = ${generateRangeTable(idStartESNext)}`,
  `pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}`,
  `pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{`,
  generateBigSwitchStatement(idStartES5OrESNext),
  `}`,
  `pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{`,
  generateBigSwitchStatement(idContinueES5OrESNext),
  `}`,
  ``,
].join("\n");

fs.writeFileSync(unicodeZigPath, unicodeZigSource);