// mmap.l
// Example RE/flex lexer to tokenize a large C/C++ file faster using mmap(2)
// and buffer(b, n) with zero copy overhead.
//
// Lexer method buffer(b, n) scans n-1 bytes at address b.
//
// WARNING: Do not use text(), span(), or rest(), because they modify the
// mmap-ed data!!!  Use str(), or begin() and size(), to extract tokens as
// strings instead.  echo() is also safe to use.
//
// When text(), span(), rest() are used, memory b[0..n] will be modified and
// b[n] will be set to zero. Also unput() should be avoided.
//
// WARNING: Do not use original Flex to do the same with yy_scan_buffer,
// because Flex requires two zero bytes and the mmap-ed buffer will be
// modified, i.e. Flex yy_scan_buffer cannot be truly read-only.
//
// mmap is the fastest method to scan a file, but no UTF detection, UTF
// conversion or other code page conversion can be applied to mmap-ed data.
// To still handle such files, we first open the file and assign it to a
// reflex::Input to detect whether it is encoded in UTF-16 or UTF-32; if so,
// we scan the file through reflex::Input instead of mmap-ing it.
//
// See also ctokens.l for a Flex-like C/C++ tokenizer without mmap(2)
%top{
#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/mman.h>
%}
/* may add %option unicode before the %include to match unicode identifiers */
%include "cdefs.l"
%option nodefault
%option fast
%%
{WHITESPACE}    /* no action: the matched text is discarded */
{ILCOMMENT}     /* no action: the matched text is discarded */
{MLCOMMENT}     /* no action: the matched text is discarded */
{DIRECTIVE}     out() << "DIRECTIVE " << str() << '\n'; /* '\n' instead of std::endl: no flush per token */
{NAME}          out() << "NAME " << str() << '\n';
{UFLT}          out() << "FLOAT " << str() << '\n';
{UINT}          out() << "INT " << str() << '\n';
{CHAR}          out() << "CHAR " << str() << '\n';
{STRING}        out() << "STRING " << str() << '\n';
"{"|"<%"        |
"}"|"%>"        |
"["|"<:"        |
"]"|":>"        |
"("             |
")"             |
"+="            |
"++"            |
"+"             |
"-="            |
"--"            |
"->*"           |
"->"            |
"-"             |
"=="            |
"="             |
"<="            |
"<<="           |
"<<"            |
"<"             |
">="            |
">>="           |
">>"            |
">"             |
"!="            |
"!"             |
","             |
";"             |
"..."           |
".*"            |
"."             |
"^="            |
"^"             |
"~"             |
"*="            |
"*"             |
"/="            |
"/"             |
"%="            |
"%"             |
"&="            |
"&&"            |
"&"             |
"|="            |
"||"            |
"|"             |
"::"            |
":"             |
"?"             out() << "PUNCT " << str() << '\n';
.               out() << "*** ERROR at line " << lineno() << '\n'; /* any character no other rule matched */
%%
// Tokenize the file named by argv[1] and print the tokens with the lexer.
//
// The file is opened with fopen() and handed to reflex::Input to detect its
// encoding.  A plain-encoded, non-empty regular file of at most 4294967295
// bytes is mmap-ed read-only and scanned in place with Lexer::buffer().  A
// file with any other detected encoding is scanned through reflex::Input.
//
// Returns 0 on success (also when no file argument is given, in which case
// nothing is done) and 1 when the file cannot be opened, inspected or mapped.
int main(int argc, char **argv)
{
  int status = 0;
  if (argc >= 2)
  {
    FILE *file = fopen(argv[1], "r"); // or use fopen_s()
    if (file != NULL)
    {
      reflex::Input input(file);
      if (input.file_encoding() == reflex::Input::file_encoding::plain)
      {
        int fd = fileno(file);
        struct stat st;
        if (fstat(fd, &st) != 0)
        {
          perror("could not stat the specified file");
          status = 1;
        }
        else if (!S_ISREG(st.st_mode) || st.st_size > 4294967295LL)
        {
          // fstat() succeeded here, so errno is stale: report the real reason
          // instead of calling perror()
          fprintf(stderr, "could not mmap the specified file: not a regular file or too large\n");
          status = 1;
        }
        else if (st.st_size > 0)
        {
          size_t size = static_cast<size_t>(st.st_size);
          void *addr = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
          if (addr != MAP_FAILED)
          {
            Lexer lexer;
            lexer.buffer(static_cast<char*>(addr), size + 1); // size + 1 to include non-accessed final byte
            lexer.lex();
            munmap(addr, size);
          }
          else
          {
            perror("could not mmap the specified file");
            status = 1;
          }
        }
        // else: the file is empty, so there is nothing to tokenize; mmap()
        // with a length of zero would fail with EINVAL
      }
      else
      {
        // not plain-encoded: let reflex::Input feed the lexer
        Lexer lexer(input);
        lexer.lex();
      }
      fclose(file);
    }
    else
    {
      perror("could not open the specified file");
      status = 1;
    }
  }
  return status;
}