forked from mathiasbynens/he
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape-spec.js
149 lines (136 loc) · 5.03 KB
/
scrape-spec.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env phantomjs
var page = require('webpage').create();
var fs = require('fs');
var jsesc = require('jsesc');
// Loads `url` in the shared PhantomJS `page` and calls `callback` (with no
// arguments) once the load reports `'success'`.
//
// On any other status the failure is logged and PhantomJS exits with status 1
// so that a calling build script can tell the scrape did not run; `callback`
// is not invoked in that case.
var open = function(url, callback) {
	page.open(url, function(status) {
		if (status !== 'success') {
			console.log('Failed to load ' + url + ' (status: ' + status + ').');
			return phantom.exit(1);
		}
		callback();
	});
};
// Serializes `data` with jsesc (JSON mode, non-compact output), writes the
// result plus a trailing newline to `fileName`, and logs a confirmation.
var writeJSON = function(fileName, data) {
	var options = {
		'json': true,
		'compact': false
	};
	var serialized = jsesc(data, options);
	fs.write(fileName, serialized + '\n', 'w');
	console.log(fileName + ' created successfully.');
};
open('https://html.spec.whatwg.org/', function() {
var result = JSON.parse(page.evaluate(function() {
// Converts a numeric code point to the string that represents it.
// Adapted from `ucs2encode`; see https://mths.be/punycode
var stringFromCharCode = String.fromCharCode;
var codePointToSymbol = function(codePoint) {
	if (codePoint > 0xFFFF) {
		// Above the BMP: emit a surrogate pair. The high surrogate carries the
		// upper 10 bits of the offset from U+10000, the low surrogate the
		// lower 10 bits.
		var offset = codePoint - 0x10000;
		var highSurrogate = (offset >>> 10 & 0x3FF) | 0xD800;
		var lowSurrogate = 0xDC00 | (offset & 0x3FF);
		return stringFromCharCode(highSurrogate) + stringFromCharCode(lowSurrogate);
	}
	return '' + stringFromCharCode(codePoint);
};
// Returns an array of consecutive integers from `start` through `stop`,
// both inclusive. The array is empty when `start` is greater than `stop`.
var range = function(start, stop) {
	var values = [];
	var next = start;
	while (next <= stop) {
		values.push(next++);
	}
	return values;
};
// Code points that cause parse errors when used in character references
// https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides
// The text to scan is taken from the last element child of the overrides
// table's parent.
var table = document.querySelector('#table-charref-overrides');
var parentChildren = table.parentNode.children;
var text = parentChildren[parentChildren.length - 1].textContent;
var charRefCodePoints = [];
// Expands a `0xAAAA to 0xBBBB` match into every code point it spans.
var addCharRefRange = function(match, firstHex, lastHex) {
	var first = parseInt(firstHex, 16);
	var last = parseInt(lastHex, 16);
	charRefCodePoints = charRefCodePoints.concat(range(first, last));
	return '';
};
// Records a stand-alone `0xAAAA` match.
var addCharRefCodePoint = function(match, hex) {
	charRefCodePoints.push(parseInt(hex, 16));
	return '';
};
// `replace` is used purely to visit each match. Ranges are consumed first and
// blanked out so that their end points are not counted again as single values.
var textWithoutRanges = text.replace(/0x([a-fA-F0-9]+)\s+to\s+0x([a-fA-F0-9]+)/g, addCharRefRange);
textWithoutRanges.replace(/0x([a-fA-F0-9]+)/g, addCharRefCodePoint);
charRefCodePoints.sort(function(a, b) {
	return a - b;
});
// Character reference overrides
// https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides
// The table cells are consumed in groups of three: the first cell of each
// group is read as the numeric reference (via `Number`), the second as the
// replacement code point in `U+XXXX` notation; the third cell is not used.
var cells = table.querySelectorAll('td');
var keys = [].filter.call(cells, function(cell, index) {
	return index % 3 === 0;
}).map(function(cell) {
	return Number(cell.textContent.trim());
});
var values = [].filter.call(cells, function(cell, index) {
	return index % 3 === 1;
}).map(function(cell) {
	var hex = cell.textContent.trim().replace('U+', '');
	var codePoint = parseInt(hex, 16);
	return codePointToSymbol(codePoint);
});
var overrides = {};
keys.forEach(function(codePoint, index) {
	var symbol = codePointToSymbol(codePoint);
	var correspondingValue = values[index];
	var mapsToItself = symbol === correspondingValue;
	if (!mapsToItself) {
		// A genuine override: the reference decodes to a different symbol.
		overrides[codePoint] = correspondingValue;
		return;
	}
	// A row that maps a code point to itself is not an override; it is only
	// recorded as an invalid character reference, unless already listed.
	if (charRefCodePoints.indexOf(codePoint) === -1) {
		charRefCodePoints.push(codePoint);
	}
});
// Entries appended above land at the end of the list, so sort again to keep
// the list in ascending order.
charRefCodePoints.sort(function(a, b) {
	return a - b;
});
// Code points for symbols that cause parse errors when in the HTML source
// https://html.spec.whatwg.org/multipage/syntax.html#preprocessing-the-input-stream
var header = document.querySelector('#preprocessing-the-input-stream');
var text;
// Walk the nodes that follow the section header until one contains the
// sentence that introduces the list of disallowed ranges.
// NOTE(review): if no node matches, the loop just runs out of siblings and
// `text` keeps whatever was examined last; nothing reports the miss.
for (var node = header.nextSibling; node; node = node.nextSibling) {
	text = node.textContent.trim();
	if (/Any occurrences of any characters in the ranges/.test(text)) {
		break;
	}
}
var rawCodePoints = [];
// Expands a `U+AAAA to U+BBBB` match into every code point it spans.
var addRawRange = function(match, firstHex, lastHex) {
	var first = parseInt(firstHex, 16);
	var last = parseInt(lastHex, 16);
	rawCodePoints = rawCodePoints.concat(range(first, last));
	return '';
};
// Records a stand-alone `U+AAAA` match.
var addRawCodePoint = function(match, hex) {
	rawCodePoints.push(parseInt(hex, 16));
	return '';
};
// Ranges are consumed first and blanked out so that their end points are not
// picked up again by the single-value pass.
var textWithoutRawRanges = text.replace(/U\+([a-fA-F0-9]+)\s+to\s+U\+([a-fA-F0-9]+)/g, addRawRange);
textWithoutRawRanges.replace(/U\+([a-fA-F0-9]+)/g, addRawCodePoint);
rawCodePoints.sort(function(a, b) {
	return a - b;
});
// U+0000 is a parse error in the Data state, which is the state that the
// input and output of `he` are meant to be in, so it is prepended to the set
// of invalid raw code points.
// https://html.spec.whatwg.org/multipage/syntax.html#data-state
rawCodePoints.unshift(0x0000);
// Pass everything back to PhantomJS.
return JSON.stringify({
'overrides': overrides,
'charRefCodePoints': charRefCodePoints,
'rawCodePoints': rawCodePoints
});
}));
// Write each scraped data set to its own JSON file, then quit PhantomJS.
var overrides = result.overrides;
var outputs = [
	['data/decode-map-overrides.json', overrides],
	// The override keys as numbers.
	['data/decode-code-points-overrides.json', Object.keys(overrides).map(Number)],
	['data/invalid-character-reference-code-points.json', result.charRefCodePoints],
	// Note: `invalid-character-reference-code-points.json` is identical to
	// `invalid-raw-code-points.json` except U+000D (CR) is not included in
	// the latter, because lone CR are converted to LF before tokenization.
	// https://html.spec.whatwg.org/multipage/syntax.html#preprocessing-the-input-stream
	['data/invalid-raw-code-points.json', result.rawCodePoints]
];
outputs.forEach(function(output) {
	writeJSON(output[0], output[1]);
});
phantom.exit();
});