-
Notifications
You must be signed in to change notification settings - Fork 88
/
Copy pathtag_tidy.l
82 lines (64 loc) · 2.36 KB
/
tag_tidy.l
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// A lexer to normalize HTML by adding closing tags and quoting non-quoted attribute values.
// Limitations: does not match HTML mixed-case tags, cannot handle DTDs.
#include <stdio.h>
#include <deque>
#include <algorithm>
class Tidy : public Lexer {
public:
virtual int lex(void);
private:
std::deque<std::string> tags;
};
%o dotall main unicode class=Tidy
dot \p{Unicode}
pi <\?
comment <!--{dot}*?-->
open <[^!/>\s]+
close <\/{dot}+?>
string \"{dot}*?\"|'{dot}*?'
value =\s*[^/>\s'"]+
%x ATTRIBUTES
%%
{comment}               {
                          // Comments are dropped: nothing is written to the output.
                        }
{pi}                    {
                          // A bare "<?" is copied to the output unchanged.
                          echo();
                        }
{open}                  {
                          // The match is "<name"; skip the '<' to get the element name.
                          const std::string name(text() + 1);
                          // Re-opening the innermost open element (e.g. "<li>" while a
                          // "<li>" is still open) closes the previous one first; its
                          // stack entry is then reused for the new element.
                          const bool reopened = !tags.empty() && tags.back() == name;
                          if (reopened)
                            out() << "</" << name << ">";
                          else
                            tags.push_back(name);
                          // Copy "<name" through and continue inside the tag.
                          echo();
                          start(ATTRIBUTES);
                        }
{close}                 {
                          // The match is "</name>"; take what lies between "</" and ">".
                          std::string tag(text() + 2, size() - 3);
                          // Whitespace is allowed before '>' (e.g. "</p >"); strip it so
                          // the name compares equal to the one pushed by the opening tag.
                          // If the name is all whitespace, find_last_not_of yields npos
                          // and npos + 1 == 0, so the whole string is erased.
                          tag.erase(tag.find_last_not_of(" \t\r\n\f") + 1);
                          // A closing tag with no matching open element is dropped.
                          if (std::find(tags.begin(), tags.end(), tag) != tags.end())
                          {
                            // Close every element opened after the matching one,
                            // innermost first, each on its own line.
                            while (!tags.empty() && tags.back() != tag)
                            {
                              out() << "</" << tags.back() << ">" << '\n';
                              tags.pop_back();
                            }
                            // Now the matching element itself is on top: pop it and
                            // copy the original closing tag to the output.
                            tags.pop_back();
                            echo();
                          }
                        }
{dot}                   {
                          // Any other character outside a tag is copied through as-is.
                          echo();
                        }
<ATTRIBUTES>"/>"        {
                          // Self-closing tag: copy "/>" through, forget the element that
                          // the opening-tag rule recorded, and leave the tag.
                          // The action is braced so that all three statements belong to
                          // this rule regardless of how the lines are indented.
                          echo();
                          // Defensive: pop_back() on an empty deque is undefined.
                          if (!tags.empty())
                            tags.pop_back();
                          start(INITIAL);
                        }
<ATTRIBUTES>"?>"        {
                          // End of a processing instruction. The {open} pattern also
                          // matches "<?target" and records "?target" on the tag stack;
                          // a processing instruction has no closing tag, so remove that
                          // entry here. Otherwise it would stay on the stack and a later
                          // identical "<?target" would emit a bogus "</?target>".
                          echo();
                          if (!tags.empty() && tags.back()[0] == '?')
                            tags.pop_back();
                          start(INITIAL);
                        }
<ATTRIBUTES>">"         {
                          // End of an opening tag: copy '>' through and resume
                          // scanning content.
                          echo();
                          start(INITIAL);
                        }
<ATTRIBUTES>{value}     {
                          // The match is '=', optional whitespace, then an unquoted
                          // value. Skip the '=' and the whitespace, then re-emit the
                          // value wrapped in double quotes.
                          const char *t = text() + 1;
                          // isspace() requires a value representable as unsigned char;
                          // a plain (possibly negative) char from a UTF-8 sequence
                          // would be undefined behaviour without the cast.
                          while (isspace(static_cast<unsigned char>(*t)))
                            ++t;
                          out() << "=\"" << t << "\"";
                        }
<ATTRIBUTES>{string}    {
                          // Already-quoted attribute values are copied unchanged.
                          echo();
                        }
<ATTRIBUTES>\s+         {
                          // Any run of whitespace inside a tag becomes a single space.
                          out() << " ";
                        }
<ATTRIBUTES>[^'"]       {
                          // Any other non-quote character inside a tag is copied through.
                          echo();
                        }
<*>.                    {
                          // Fallback for every start condition: reached only when no
                          // earlier rule matches. Report the problem and stop scanning.
                          fprintf(stderr, "Invalid XML encoding\n");
                          return 0;
                        }
%%