-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscan.ulex
307 lines (266 loc) · 13.4 KB
/
scan.ulex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
%defs (
(* Make the contents of the Token structure available unqualified; the EOF
   constructor used just below is presumably one of them. *)
open Token
(* lex_result is the type ml-ulex requires every rule action to return. *)
type lex_result = Token.token
(* Thunk producing the EOF token. *)
val eof = fn () => EOF
(* Raised for every lexical error reported by this specification: carries a
   human-readable message and an AntlrStreamPos.pos. *)
exception UXMLLexer of string * AntlrStreamPos.pos
(* Renders a source location for error messages as
   "[<line>.<column>] [state=<state>]".
   yylineno and yycolno are int refs and are dereferenced here; state is the
   name of the start state the caller is in. *)
fun makePosStr (yylineno, yycolno, state) =
  String.concat
    [ "[", Int.toString (!yylineno), ".", Int.toString (!yycolno)
    , "] [state=", state, "]" ]
(* Converts the matched text of a character reference ("&#DDD;" or "&#xHHH;")
   into an XMLCHARREF token carrying the scanned integer.

   r      the matched text, including the leading "&#" / "&#x" and trailing ";"
   yypos  position stored in the exception on failure
   pos    (yylineno, yycolno, state) triple handed to makePosStr

   Raises UXMLLexer when the digits cannot be scanned, and also when the
   value does not fit in an int: Int.scan signals that case with Overflow,
   which would otherwise escape as a raw exception without position
   information. *)
fun xmlCharRef r yypos pos =
  let
    val (digitStart, radix) =
      if String.isPrefix "&#x" r
      then (3, StringCvt.HEX)
      else (2, StringCvt.DEC)
    (* Everything after the prefix.  The trailing ";" is left in place; the
       scanner stops at the first character that is not a digit. *)
    val digits = String.extract (r, digitStart, NONE)
    (* Single place where a positioned lexer error is built. *)
    fun lexError msg = raise UXMLLexer (makePosStr pos ^ ": " ^ msg, yypos)
    val scanned =
      (StringCvt.scanString (Int.scan radix) digits)
      handle Overflow =>
        lexError ("character reference out of range: " ^ digits)
  in
    case scanned of
      NONE => lexError ("StringCvt.scanString failed: " ^ digits)
    | SOME word => XMLCHARREF word
  end
(* Builds an XMLENTITYREF token from the matched text "&name;": the first
   character and the last character are stripped, leaving the bare name. *)
fun xmlEntityRef r =
  let
    val name = String.substring (r, 1, String.size r - 2)
  in
    XMLENTITYREF name
  end
(* Builds an XMLPEREF token from the matched text "%name;": the first
   character and the last character are stripped, leaving the bare name. *)
fun xmlPERef r =
  let
    val name = String.substring (r, 1, String.size r - 2)
  in
    XMLPEREF name
  end
(* Shared accumulator for text collected piecewise by rule actions.
   Fragments are stored newest-first. *)
val stringBuffer : string list ref = ref []

(* Pushes fragment s onto buffer sb. *)
fun append (sb, s : string) = sb := (s :: !sb)

(* Returns the fragments of sb joined in insertion order and leaves sb
   empty. *)
fun toString sb =
  let
    val text = String.concat (List.rev (!sb))
  in
    sb := [];
    text
  end
(* Returns s without its first n characters.  Raises Subscript when n is
   negative or greater than the length of s. *)
fun drop n s = String.substring (s, n, String.size s - n)
(* Strips the first and last character of s (the surrounding quotes of a
   quoted literal).  Raises Subscript when s has fewer than two characters. *)
fun trimQuot s = String.substring (s, 1, String.size s - 2)
(* Reports unexpected input: always raises UXMLLexer with a message of the
   form "<position>: unexpected `<text>'" and the given stream position.
   pos is the (yylineno, yycolno, state) triple expected by makePosStr. *)
fun fail yytext yypos pos =
  let
    val location = makePosStr pos
    val message = String.concat [location, ": unexpected `", yytext, "'"]
  in
    raise UXMLLexer (message, yypos)
  end
);
(* Name of the structure that ml-ulex generates for this lexer. *)
%name UXMLLexer;
(* Start states used by the rules, in addition to the implicit INITIAL state.
   NOTE(review): DECL and XML are declared here, but no rule in this part of
   the file refers to them -- confirm whether they are still needed. *)
%states DECL DOCTYPE XML STAG ETAG ATT_APOS ATT_QUOT CDATA PI PI_CONTENT
XML_COMMENT ELEMENT_DECL ATTLIST_DECL ENTITY_DECL NOTATION_DECL
DOCTYPE_COMMENT DOCTYPE_PI DOCTYPE_PI_CONTENT
ATTLIST_QUOT ATTLIST_APOS
ENTITY_VALUE_QUOT ENTITY_VALUE_APOS
SYSTEMID_PUBID
;
(* Named regular expressions for characters and names.  Each is preceded by
   the grammar production it transcribes; the bracketed number is the
   production number from the XML 1.0 specification. *)

(* [2] Char ::=
#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
[#x10000-#x10FFFF] *)
%let xmlChar =
[\u0009\u000a\u000d] | [\u0020-\ud7ff] | [\ue000-\ufffd] |
[\U00010000-\U0010ffff];
(* [3] S ::= (#x20 | #x9 | #xD | #xA)+ *)
%let xmlS = [\u0020\u0009\u000d\u000a]+;
(* [4] NameStartChar ::=
":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
[#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
[#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
[#xFDF0-#xFFFD] | [#x10000-#xEFFFF] *)
%let xmlNameStartChar =
":" | [A-Z] | "_" | [a-z] | [\u00c0-\u00d6] | [\u00d8-\u00f6] |
[\u00f8-\u02ff] | [\u0370-\u037d] | [\u037f-\u1fff] | [\u200c-\u200d] |
[\u2070-\u218f] | [\u2c00-\u2fef] | [\u3001-\ud7ff] | [\uf900-\ufdcf] |
[\ufdf0-\ufffd] | [\U00010000-\U000effff];
(* [4a] NameChar ::=
NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] |
[#x203F-#x2040] *)
%let xmlNameChar =
{xmlNameStartChar} | "-" | "." | [0-9] | [\u00b7] | [\u0300-\u036f] |
[\u203f-\u2040];
(* [5] Name ::= NameStartChar (NameChar)* *)
%let xmlName = {xmlNameStartChar} {xmlNameChar}*;
(* [7] Nmtoken ::= (NameChar)+ *)
%let xmlNmtoken = {xmlNameChar}+;
(* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]* ) *)
(* The set difference of the production is written as intersection with a
   negation. *)
%let xmlCharData = [^<&]* & ~([^<&]* "]]>" [^<&]*);
(* [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) *)
%let piTarget = {xmlName} & ~([Xx] [Mm] [Ll]);
(* [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' *)
%let xmlCharRef = "&#" [0-9]+ ";" | "&#x" [0-9a-fA-F]+ ";";
(* [68] EntityRef ::= '&' Name ';' *)
%let xmlEntityRef = "&" {xmlName} ";";
(* [67] Reference ::= EntityRef | CharRef *)
%let xmlReference = {xmlEntityRef} | {xmlCharRef};
(* [69] PEReference ::= '%' Name ';' *)
%let xmlPEReference = "%" {xmlName} ";";
(* [9] EntityValue: a quoted string of ordinary characters, parameter-entity
   references and references, in either quote style. *)
%let xmlEntityValue = "\"" ([^%&"] | {xmlPEReference} | {xmlReference})* "\""
| "'" ([^%&'] | {xmlPEReference} | {xmlReference})* "'";
(* [11] SystemLiteral: any characters except the delimiting quote. *)
%let xmlSystemLiteral = ("\"" [^"]* "\"") | ("'" [^']* "'");
(* [13] PubidChar: space, CR, LF, ASCII letters and digits, and a fixed set
   of punctuation characters that includes the apostrophe. *)
%let xmlPubidChar = [\032\013\010] | [a-zA-Z0-9] | [\-'\(\)\+,\.\/:\=?\;!\*#@\$_%];
(* PubidChar minus the apostrophe, for use inside an apostrophe-delimited
   literal. *)
%let xmlPubidCharExceptQuot = [\032\013\010] | [a-zA-Z0-9] | [-()+,./:=?;!*#@$_%];
(* [12] PubidLiteral: zero or more PubidChar between double quotes, or zero
   or more PubidChar other than the apostrophe between apostrophes.  Both
   alternatives repeat their character class; the apostrophe form previously
   lacked the repetition and matched exactly one character. *)
%let xmlPubidLiteral = "\"" {xmlPubidChar}* "\""
| "'" {xmlPubidCharExceptQuot}* "'";
(* INITIAL: document content outside of any markup.  Character data and
   references are emitted as tokens; each markup opener switches to the state
   that tokenizes the rest of that construct. *)
<INITIAL>{xmlCharData}  => (XMLCHARS(yytext));
<INITIAL>"<" {xmlName}  => (YYBEGIN STAG; LSTAG(drop 1 yytext));
<INITIAL>"</" {xmlName} => (YYBEGIN ETAG; LETAG(drop 2 yytext));
<INITIAL>{xmlCharRef}   => (xmlCharRef yytext yypos (yylineno, yycolno, "INITIAL"));
<INITIAL>{xmlEntityRef} => (xmlEntityRef yytext);
(* Empty the shared buffer before collecting a CDATA section.  toString is the
   only other place that clears it, and it runs only when "]]>" is reached, so
   text left behind by a run that aborted inside a CDATA section would
   otherwise be prepended to this section. *)
<INITIAL>"<![CDATA["    => (stringBuffer := []; YYBEGIN CDATA; continue());
<INITIAL>"<!--"         => (YYBEGIN XML_COMMENT; LXMLCOMMENT);
<INITIAL>"<?" {piTarget} => (YYBEGIN PI; LPI(drop 2 yytext));
<INITIAL>"<!DOCTYPE"    => (YYBEGIN DOCTYPE; LDOCTYPE);
(* Anything else is an error; the two-character form puts the character after
   "<" into the message. *)
<INITIAL>"<".           => (fail yytext yypos (yylineno, yycolno, "INITIAL"));
<INITIAL>.              => (fail yytext yypos (yylineno, yycolno, "INITIAL"));
(* STAG: inside a start tag, after the element name consumed by the INITIAL
   rule for "<" Name.  Whitespace is skipped; names, "=" and attribute values
   are tokenized until ">" or "/>" switches back to INITIAL. *)
<STAG>{xmlS}    => (continue ());
<STAG>{xmlName} => (XMLNAME yytext);
<STAG>"="       => (EQUAL);
<STAG>">"       => (YYBEGIN INITIAL; RTAG);
<STAG>"/>"      => (YYBEGIN INITIAL; REMPTYTAG);
(* An empty attribute value is emitted directly as an empty XMLCHARS. *)
<STAG>"''"      => (XMLCHARS "");
<STAG>"\"\""    => (XMLCHARS "");
(* Opening quote of a non-empty value: hand over to the matching value state. *)
<STAG>"'"       => (YYBEGIN ATT_APOS; continue ());
<STAG>"\""      => (YYBEGIN ATT_QUOT; continue ());
<STAG>"/" .     => (fail yytext yypos (yylineno, yycolno, "STAG"));
<STAG>.         => (fail yytext yypos (yylineno, yycolno, "STAG"));
(* ATT_QUOT: body of a double-quoted attribute value.
   [10] AttValue ::= '"' ([^<&"] | Reference)* '"' *)
<ATT_QUOT>[^<&"]+        => (XMLCHARS yytext);
<ATT_QUOT>{xmlCharRef}   => (xmlCharRef yytext yypos (yylineno, yycolno, "ATT_QUOT"));
<ATT_QUOT>{xmlEntityRef} => (xmlEntityRef yytext);
(* Closing quote: back to the enclosing start tag. *)
<ATT_QUOT>"\""           => (YYBEGIN STAG; continue ());
(* An "&" that does not begin a complete reference, or a bare "<", is an
   error. *)
<ATT_QUOT>"&" "#"? [0-9a-zA-Z]*
                         => (fail yytext yypos (yylineno, yycolno, "ATT_QUOT"));
<ATT_QUOT>"<"            => (fail yytext yypos (yylineno, yycolno, "ATT_QUOT"));
(* ATT_APOS: body of an apostrophe-quoted attribute value.
   [10] AttValue, second alternative: "'" ([^<&'] | Reference)* "'" *)
<ATT_APOS>[^<&']+        => (XMLCHARS yytext);
<ATT_APOS>{xmlCharRef}   => (xmlCharRef yytext yypos (yylineno, yycolno, "ATT_APOS"));
<ATT_APOS>{xmlEntityRef} => (xmlEntityRef yytext);
(* Closing apostrophe: back to the enclosing start tag. *)
<ATT_APOS>"'"            => (YYBEGIN STAG; continue ());
(* An "&" that does not begin a complete reference, or a bare "<", is an
   error. *)
<ATT_APOS>"&" "#"? [0-9a-zA-Z]*
                         => (fail yytext yypos (yylineno, yycolno, "ATT_APOS"));
<ATT_APOS>"<"            => (fail yytext yypos (yylineno, yycolno, "ATT_APOS"));
(* ETAG: inside an end tag, after the name consumed by the INITIAL rule for
   "</" Name.  Only whitespace and the closing ">" are accepted. *)
<ETAG>{xmlS} => (continue ());
<ETAG>">"    => (YYBEGIN INITIAL; RTAG);
<ETAG>.      => (fail yytext yypos (yylineno, yycolno, "ETAG"));
(* [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' *)
(* XML_COMMENT is entered from INITIAL on "<!--"; the terminator returns to
   INITIAL. *)
<XML_COMMENT>"-->" => (YYBEGIN INITIAL; RXMLCOMMENT);
(* A run of comment text: any legal character other than "-", or a single "-"
   followed by a legal character other than "-". *)
<XML_COMMENT>(({xmlChar} & ~"-") | ("-" ({xmlChar} & ~"-")))+
=> (XMLCHARS(yytext));
(* Two consecutive hyphens that do not start the terminator are an error. *)
<XML_COMMENT>"--" => (fail yytext yypos (yylineno, yycolno, "XML_COMMENT"));
(* Any character matched by "." that is not an xmlChar is an error. *)
<XML_COMMENT>(. & ~{xmlChar}) => (fail yytext yypos (yylineno, yycolno, "XML_COMMENT"));
(* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char* )))? '?>' *)
(* PI is entered from INITIAL after "<?" PITarget.  Whitespace moves on to
   PI_CONTENT; an immediate "?>" ends a processing instruction that has no
   content. *)
<PI>{xmlS}
(* White space immediately following the target name of a PI. *)
=> (YYBEGIN PI_CONTENT; continue());
<PI>"?>" => (YYBEGIN INITIAL; RPI);
(* NOTE(review): "?" is excluded from this error rule, so a "?" that is not
   followed by ">" appears to match no PI rule at all -- confirm how the
   generated lexer reports that case. *)
<PI>(. & ~{xmlS} & ~"?") => (fail yytext yypos (yylineno, yycolno, "PI"));
(* PI_CONTENT: text of the processing instruction up to the closing "?>". *)
<PI_CONTENT>"?>" => (YYBEGIN INITIAL; RPI);
<PI_CONTENT>({xmlChar} & ~"?")+ => (XMLCHARS(yytext));
(* A "?" that does not start "?>" is ordinary content. *)
<PI_CONTENT>"?" => (XMLCHARS(yytext));
<PI_CONTENT>(. & ~{xmlChar}) => (fail yytext yypos (yylineno, yycolno, "PI_CONTENT"));
(* CDATA: body of a CDATA section, entered from INITIAL on "<![CDATA[".
   [20] CData ::= (Char* - (Char* ']]>' Char* ))
   Fragments are pushed onto stringBuffer and emitted as one XMLCDSECT token
   when the terminator is reached. *)
<CDATA>"]]>"               => (YYBEGIN INITIAL; XMLCDSECT (toString stringBuffer));
<CDATA>({xmlChar} & ~"]")+ => (append (stringBuffer, yytext); continue ());
(* A "]" that does not start the terminator is ordinary content. *)
<CDATA>"]"                 => (append (stringBuffer, yytext); continue ());
<CDATA>(. & ~{xmlChar})    => (fail yytext yypos (yylineno, yycolno, "CDATA"));
(* DOCTYPE: inside a document type declaration, entered from INITIAL on
   "<!DOCTYPE".  The markup declarations of the internal subset each have
   their own state and return here when they end. *)
<DOCTYPE>">"              => (YYBEGIN INITIAL; RTAG);
<DOCTYPE>{xmlS}           => (continue ());
<DOCTYPE>{xmlPEReference} => (xmlPERef yytext);
(* Keywords are listed before the general name rule so that they win when
   both match the same text. *)
<DOCTYPE>"PUBLIC"         => (PUBLIC);
<DOCTYPE>"SYSTEM"         => (SYSTEM);
<DOCTYPE>{xmlName}        => (XMLNAME yytext);
<DOCTYPE>"["              => (LBRACKET);
<DOCTYPE>"]"              => (RBRACKET);
<DOCTYPE>"<!ELEMENT"      => (YYBEGIN ELEMENT_DECL; LELEMENTDECL);
<DOCTYPE>"<!ATTLIST"      => (YYBEGIN ATTLIST_DECL; LATTLISTDECL);
<DOCTYPE>"<!ENTITY"       => (YYBEGIN ENTITY_DECL; LENTITYDECL);
<DOCTYPE>"<!NOTATION"     => (YYBEGIN NOTATION_DECL; LNOTATIONDECL);
<DOCTYPE>"<?" {piTarget}  => (YYBEGIN DOCTYPE_PI; LPI (drop 2 yytext));
<DOCTYPE>"<!--"           => (YYBEGIN DOCTYPE_COMMENT; continue ());
(* Quoted literals are emitted without their surrounding quotes. *)
<DOCTYPE>"\"" [^"]* "\""  => (XMLCHARS (trimQuot yytext));
<DOCTYPE>"'" [^']* "'"    => (XMLCHARS (trimQuot yytext));
<DOCTYPE>.                => (fail yytext yypos (yylineno, yycolno, "DOCTYPE"));
(* ELEMENT_DECL: inside "<!ELEMENT ... >", entered from DOCTYPE.  Emits names
   and the punctuation of content models; ">" returns to DOCTYPE. *)
<ELEMENT_DECL>">"       => (YYBEGIN DOCTYPE; RTAG);
<ELEMENT_DECL>{xmlS}    => (continue ());
<ELEMENT_DECL>{xmlName} => (XMLNAME yytext);
(* "#PCDATA" is passed on as a name token. *)
<ELEMENT_DECL>"#PCDATA" => (XMLNAME yytext);
<ELEMENT_DECL>"("       => (LPAREN);
<ELEMENT_DECL>")"       => (RPAREN);
<ELEMENT_DECL>"|"       => (BAR);
<ELEMENT_DECL>"?"       => (QUESTION);
<ELEMENT_DECL>"*"       => (STAR);
<ELEMENT_DECL>"+"       => (PLUS);
<ELEMENT_DECL>","       => (COMMA);
<ELEMENT_DECL>.         => (fail yytext yypos (yylineno, yycolno, "ELEMENT_DECL"));
(* ATTLIST_DECL: inside "<!ATTLIST ... >", entered from DOCTYPE.  ">" returns
   to DOCTYPE. *)
<ATTLIST_DECL>">"          => (YYBEGIN DOCTYPE; RTAG);
<ATTLIST_DECL>{xmlS}       => (continue ());
(* Attribute-type and default-declaration keywords.  They come before the
   general name rules so that they win when both match the same text. *)
<ATTLIST_DECL>"CDATA"      => (CDATA_KW);
<ATTLIST_DECL>"ID"         => (ID);
<ATTLIST_DECL>"IDREF"      => (IDREF);
<ATTLIST_DECL>"IDREFS"     => (IDREFS);
<ATTLIST_DECL>"ENTITY"     => (ENTITY);
<ATTLIST_DECL>"ENTITIES"   => (ENTITIES);
<ATTLIST_DECL>"NMTOKEN"    => (NMTOKEN);
<ATTLIST_DECL>"NMTOKENS"   => (NMTOKENS);
<ATTLIST_DECL>"NOTATION"   => (NOTATION);
<ATTLIST_DECL>"#REQUIRED"  => (REQUIRED);
<ATTLIST_DECL>"#IMPLIED"   => (IMPLIED);
<ATTLIST_DECL>"#FIXED"     => (FIXED);
(* Names and name tokens are both emitted as XMLNAME. *)
<ATTLIST_DECL>{xmlName}    => (XMLNAME yytext);
<ATTLIST_DECL>{xmlNmtoken} => (XMLNAME yytext);
<ATTLIST_DECL>"("          => (LPAREN);
<ATTLIST_DECL>")"          => (RPAREN);
<ATTLIST_DECL>"|"          => (BAR);
(* Opening quote of a default value: hand over to the matching value state. *)
<ATTLIST_DECL>"'"          => (YYBEGIN ATTLIST_APOS; continue ());
<ATTLIST_DECL>"\""         => (YYBEGIN ATTLIST_QUOT; continue ());
<ATTLIST_DECL>.            => (fail yytext yypos (yylineno, yycolno, "ATTLIST_DECL"));
(* ATTLIST_QUOT: body of a double-quoted default value inside an ATTLIST
   declaration; the closing quote returns to ATTLIST_DECL.  Every error
   message carries the state name "ATTLIST_QUOT", spelled as in the %states
   declaration. *)
<ATTLIST_QUOT>[^<&"]+ => (XMLCHARS(yytext));
<ATTLIST_QUOT>{xmlCharRef} => (xmlCharRef yytext yypos (yylineno, yycolno, "ATTLIST_QUOT"));
<ATTLIST_QUOT>{xmlEntityRef} => (xmlEntityRef yytext);
<ATTLIST_QUOT>[\"] => (YYBEGIN ATTLIST_DECL; continue());
(* An "&" that does not begin a complete reference, or a bare "<", is an
   error. *)
<ATTLIST_QUOT>"&" "#"? [0-9a-zA-Z]* => (fail yytext yypos (yylineno, yycolno, "ATTLIST_QUOT"));
<ATTLIST_QUOT>"<" => (fail yytext yypos (yylineno, yycolno, "ATTLIST_QUOT"));
(* ATTLIST_APOS: body of an apostrophe-quoted default value inside an ATTLIST
   declaration; the closing apostrophe returns to ATTLIST_DECL. *)
<ATTLIST_APOS>[^<&']+        => (XMLCHARS yytext);
<ATTLIST_APOS>{xmlCharRef}   => (xmlCharRef yytext yypos (yylineno, yycolno, "ATTLIST_APOS"));
<ATTLIST_APOS>{xmlEntityRef} => (xmlEntityRef yytext);
<ATTLIST_APOS>"'"            => (YYBEGIN ATTLIST_DECL; continue ());
(* An "&" that does not begin a complete reference, or a bare "<", is an
   error. *)
<ATTLIST_APOS>"&" "#"? [0-9a-zA-Z]*
                             => (fail yytext yypos (yylineno, yycolno, "ATTLIST_APOS"));
<ATTLIST_APOS>"<"            => (fail yytext yypos (yylineno, yycolno, "ATTLIST_APOS"));
(* ENTITY_DECL: inside "<!ENTITY ... >", entered from DOCTYPE.  ">" returns to
   DOCTYPE. *)
<ENTITY_DECL>">"       => (YYBEGIN DOCTYPE; RTAG);
<ENTITY_DECL>{xmlS}    => (continue ());
(* An external identifier keyword switches to SYSTEMID_PUBID for the rest of
   the declaration. *)
<ENTITY_DECL>"PUBLIC"  => (YYBEGIN SYSTEMID_PUBID; PUBLIC);
<ENTITY_DECL>"SYSTEM"  => (YYBEGIN SYSTEMID_PUBID; SYSTEM);
<ENTITY_DECL>{xmlName} => (XMLNAME yytext);
<ENTITY_DECL>"%"       => (PERCENT);
(* Opening quote of an entity value: hand over to the matching value state. *)
<ENTITY_DECL>"\""      => (YYBEGIN ENTITY_VALUE_QUOT; continue ());
<ENTITY_DECL>"'"       => (YYBEGIN ENTITY_VALUE_APOS; continue ());
<ENTITY_DECL>.         => (fail yytext yypos (yylineno, yycolno, "ENTITY_DECL"));
(* ENTITY_VALUE_QUOT: body of a double-quoted entity value; the closing quote
   returns to ENTITY_DECL. *)
<ENTITY_VALUE_QUOT>[^%&"]+          => (XMLCHARS yytext);
<ENTITY_VALUE_QUOT>{xmlPEReference} => (xmlPERef yytext);
<ENTITY_VALUE_QUOT>{xmlCharRef}     => (xmlCharRef yytext yypos (yylineno, yycolno, "ENTITY_VALUE_QUOT"));
<ENTITY_VALUE_QUOT>{xmlEntityRef}   => (xmlEntityRef yytext);
<ENTITY_VALUE_QUOT>"\""             => (YYBEGIN ENTITY_DECL; continue ());
(* An "&" or "%" that does not begin a complete reference is an error. *)
<ENTITY_VALUE_QUOT>"&" "#"? [0-9a-zA-Z]*
                                    => (fail yytext yypos (yylineno, yycolno, "ENTITY_VALUE_QUOT"));
<ENTITY_VALUE_QUOT>"%" [0-9a-zA-Z]* => (fail yytext yypos (yylineno, yycolno, "ENTITY_VALUE_QUOT"));
(* ENTITY_VALUE_APOS: body of an apostrophe-quoted entity value; the closing
   apostrophe returns to ENTITY_DECL. *)
<ENTITY_VALUE_APOS>[^%&']+          => (XMLCHARS yytext);
<ENTITY_VALUE_APOS>{xmlPEReference} => (xmlPERef yytext);
<ENTITY_VALUE_APOS>{xmlCharRef}     => (xmlCharRef yytext yypos (yylineno, yycolno, "ENTITY_VALUE_APOS"));
<ENTITY_VALUE_APOS>{xmlEntityRef}   => (xmlEntityRef yytext);
<ENTITY_VALUE_APOS>"'"              => (YYBEGIN ENTITY_DECL; continue ());
(* An "&" or "%" that does not begin a complete reference is an error. *)
<ENTITY_VALUE_APOS>"&" "#"? [0-9a-zA-Z]*
                                    => (fail yytext yypos (yylineno, yycolno, "ENTITY_VALUE_APOS"));
<ENTITY_VALUE_APOS>"%" [0-9a-zA-Z]* => (fail yytext yypos (yylineno, yycolno, "ENTITY_VALUE_APOS"));
(* SYSTEMID_PUBID: remainder of an entity declaration after PUBLIC or SYSTEM
   (entered from ENTITY_DECL).  ">" returns directly to DOCTYPE. *)
<SYSTEMID_PUBID>">"             => (YYBEGIN DOCTYPE; RTAG);
<SYSTEMID_PUBID>{xmlS}          => (continue ());
(* The keyword is listed before the general name rule so that it wins. *)
<SYSTEMID_PUBID>"NDATA"         => (NDATA);
<SYSTEMID_PUBID>{xmlName}       => (XMLNAME yytext);
(* Quoted literals are emitted without their surrounding quotes. *)
<SYSTEMID_PUBID>"\"" [^"]* "\"" => (XMLCHARS (trimQuot yytext));
<SYSTEMID_PUBID>"'" [^']* "'"   => (XMLCHARS (trimQuot yytext));
<SYSTEMID_PUBID>.               => (fail yytext yypos (yylineno, yycolno, "SYSTEMID_PUBID"));
(* NOTATION_DECL: inside "<!NOTATION ... >", entered from DOCTYPE.  ">"
   returns to DOCTYPE. *)
<NOTATION_DECL>">"             => (YYBEGIN DOCTYPE; RTAG);
<NOTATION_DECL>{xmlS}          => (continue ());
(* Keywords are listed before the general name rule so that they win. *)
<NOTATION_DECL>"PUBLIC"        => (PUBLIC);
<NOTATION_DECL>"SYSTEM"        => (SYSTEM);
<NOTATION_DECL>{xmlName}       => (XMLNAME yytext);
(* Quoted literals are emitted without their surrounding quotes. *)
<NOTATION_DECL>"\"" [^"]* "\"" => (XMLCHARS (trimQuot yytext));
<NOTATION_DECL>"'" [^']* "'"   => (XMLCHARS (trimQuot yytext));
<NOTATION_DECL>.               => (fail yytext yypos (yylineno, yycolno, "NOTATION_DECL"));
(* DOCTYPE_COMMENT: a comment inside the document type declaration, entered
   from DOCTYPE on "<!--".  No token is produced for the opener, the text or
   the terminator; the terminator returns to DOCTYPE. *)
<DOCTYPE_COMMENT>"-->" => (YYBEGIN DOCTYPE; continue());
(* A run of comment text: any legal character other than "-", or a single "-"
   followed by a legal character other than "-". *)
<DOCTYPE_COMMENT>(({xmlChar} & ~"-") | ("-" ({xmlChar} & ~"-")))+ => (continue());
(* Two consecutive hyphens that do not start the terminator are an error. *)
<DOCTYPE_COMMENT>"--" => (fail yytext yypos (yylineno, yycolno, "DOCTYPE_COMMENT"));
<DOCTYPE_COMMENT>(. & ~{xmlChar}) => (fail yytext yypos (yylineno, yycolno, "DOCTYPE_COMMENT"));
(* DOCTYPE_PI: a processing instruction inside the document type declaration,
   entered from DOCTYPE after "<?" PITarget.  Same shape as the PI and
   PI_CONTENT rules, except that "?>" returns to DOCTYPE. *)
<DOCTYPE_PI>{xmlS}
(* White space immediately following the target name of a PI. *)
=> (YYBEGIN DOCTYPE_PI_CONTENT; continue());
<DOCTYPE_PI>"?>" => (YYBEGIN DOCTYPE; RPI);
(* NOTE(review): "?" is excluded from this error rule, so a "?" that is not
   followed by ">" appears to match no DOCTYPE_PI rule at all -- confirm how
   the generated lexer reports that case. *)
<DOCTYPE_PI>(. & ~{xmlS} & ~"?") => (fail yytext yypos (yylineno, yycolno, "DOCTYPE_PI"));
(* DOCTYPE_PI_CONTENT: text of the processing instruction up to "?>". *)
<DOCTYPE_PI_CONTENT>"?>" => (YYBEGIN DOCTYPE; RPI);
<DOCTYPE_PI_CONTENT>({xmlChar} & ~"?")+ => (XMLCHARS(yytext));
(* A "?" that does not start "?>" is ordinary content. *)
<DOCTYPE_PI_CONTENT>"?" => (XMLCHARS(yytext));
<DOCTYPE_PI_CONTENT>(. & ~{xmlChar}) => (fail yytext yypos (yylineno, yycolno, "DOCTYPE_PI_CONTENT"));