// File: GenLexer.java
// Date: October 2015
// Java source file provided for Informatics 2A Assignment 1 (2015).
// Contains general infrastructure relating to DFAs and longest-match lexing,
// along with some trivial examples.
import java.io.* ;
// Some useful sets of characters.
class CharTypes {

    static boolean isLetter (char c) {
        return (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) ;
    }

    static boolean isSmall (char c) {
        return (('a' <= c && c <= 'z') || c == '_') ;
    }

    static boolean isLarge (char c) {
        return ('A' <= c && c <= 'Z') ;
    }

    static boolean isDigit (char c) {
        return ('0' <= c && c <= '9') ;
    }

    static boolean isSymbolic (char c) {
        return (c == '!' || c == '#' || c == '$' || c == '%' || c == '&' ||
                c == '*' || c == '+' || c == '.' || c == '/' || c == '<' ||
                c == '=' || c == '>' || c == '?' || c == '@' || c == '\\' ||
                c == '^' || c == '|' || c == '-' || c == '~' || c == ':') ;
    }

    static boolean isWhitespace (char c) {
        return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f') ;
    }

    static boolean isNewline (char c) {
        return (c == '\r' || c == '\n' || c == '\f') ;
    }
}
// Generic implementation of DFAs with explicit "dead" states
interface DFA {
    String lexClass() ;        // name of the lexical class this DFA accepts
    int numberOfStates() ;
    void reset() ;             // return to the initial state
    void processChar (char c) throws StateOutOfRange ;   // perform one state transition
    boolean isAccepting () ;   // is the current state accepting?
    boolean isDead () ;        // can acceptance never be reached from the current state?
}
abstract class GenAcceptor {

    // Stubs for methods specific to a particular DFA
    abstract String lexClass() ;
    abstract int numberOfStates() ;
    abstract int nextState (int state, char input) ;
    abstract boolean accepting (int state) ;
    abstract boolean dead (int state) ;

    // General DFA machinery
    private int currState = 0 ;   // the initial state is always 0

    public void reset () {currState = 0 ;}

    public void processChar (char c) throws StateOutOfRange {
        // performs the state transition determined by c
        currState = nextState (currState,c) ;
        if (currState >= numberOfStates() || currState < 0) {
            throw new StateOutOfRange (lexClass(), currState) ;
        }
    }

    public boolean isAccepting () {return accepting (currState) ;}
    public boolean isDead () {return dead (currState) ;}
}
class StateOutOfRange extends Exception {
    public StateOutOfRange (String lexClassName, int state) {
        super ("Illegal state " + Integer.toString(state) +
               " in acceptor for " + lexClassName) ;
    }
}
// Example DFAs for some deliberately weird lexical classes.
// These show how particular DFAs are to be constructed.
// Example 1: Tokens consisting of an even number of letters (and nothing else)

class EvenLetterAcceptor extends GenAcceptor implements DFA {

    public String lexClass() {return "EVEN" ;} ;
    public int numberOfStates() {return 3 ;} ;

    int nextState (int state, char c) {
        switch (state) {
            case 0: if (CharTypes.isLetter(c)) return 1 ; else return 2 ;
            case 1: if (CharTypes.isLetter(c)) return 0 ; else return 2 ;
            default: return 2 ;   // garbage state, declared "dead" below
        }
    }

    boolean accepting (int state) {return (state == 0) ;}
    boolean dead (int state) {return (state == 2) ;}
}
// To create an instance of the above class, use `new EvenLetterAcceptor()'
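// A small (hypothetical) driver for this DFA, ignoring the checked
// StateOutOfRange exception for brevity, might look like:
//
//   DFA acc = new EvenLetterAcceptor() ;
//   acc.reset() ;
//   acc.processChar('a') ;             // one letter so far: not accepting
//   acc.processChar('b') ;             // two letters: back in the accepting state
//   boolean ok = acc.isAccepting() ;   // true, since "ab" has an even number of letters
//   acc.processChar('!') ;             // non-letter: moves to the dead state 2
//   boolean stuck = acc.isDead() ;     // true: no extension of "ab!" can ever be accepted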
// Example 2: Acceptor for just the token "&&"

class AndAcceptor extends GenAcceptor implements DFA {

    public String lexClass() {return "&&" ;} ;
    public int numberOfStates() {return 4 ;} ;

    int nextState (int state, char c) {
        switch (state) {
            case 0: if (c=='&') return 1 ; else return 3 ;
            case 1: if (c=='&') return 2 ; else return 3 ;
            case 2: return 3 ;
            default: return 3 ;
        }
    }

    boolean accepting (int state) {return (state == 2) ;}
    boolean dead (int state) {return (state == 3) ;}
}
// Example 3: Acceptor for just a space or linebreak character.
// Setting the lexical class to "" means these tokens will be discarded
// by the lexer.

class SpaceAcceptor extends GenAcceptor implements DFA {

    public String lexClass() {return "" ;} ;
    public int numberOfStates() {return 3 ;} ;

    int nextState (int state, char c) {
        switch (state) {
            case 0: if (c == ' ' || c == '\n' || c == '\r') return 1 ;
                    else return 2 ;
            default: return 2 ;
        }
    }

    boolean accepting (int state) {return (state == 1) ;}
    boolean dead (int state) {return (state == 2) ;}
}
// Generic lexical analyser.
// Uses principle of longest match, a.k.a. "maximal munch".
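// (Illustration: with the example acceptors defined above, the input "abab" is
//  lexed as the single token "abab" of class EVEN rather than as two tokens
//  "ab" "ab", because the lexer always consumes the longest prefix accepted by
//  some acceptor.)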
// A *lexical token* is simply a string tagged with the name of its
// lexical class.
class LexToken {

    private final String value, lexClass ;

    LexToken (String value, String lexClass) {
        this.value = value ; this.lexClass = lexClass ;
    }

    public String value () {return this.value ;} ;
    public String lexClass () {return this.lexClass ;} ;
}
// Typical example: new LexToken ("5", "num")
// The output of the lexing phase, and the input to the parsing phase,
// will be a LEX_TOKEN_STREAM object from which tokens may be drawn at will
// by calling `pullToken' or `pullProperToken'.

interface LEX_TOKEN_STREAM {

    LexToken pullToken () throws Exception ;
    // pulls next token from stream, regardless of class

    LexToken pullProperToken () throws Exception ;
    // pulls next token not of class "" (e.g. skipping whitespace and comments)

    LexToken peekToken () throws Exception ;
    // returns next token without removing it from stream

    LexToken peekProperToken () throws Exception ;
    // similarly for non-"" tokens

    // All these methods return null once end of input is reached
}
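// As an illustration, hypothetical client code for some LEX_TOKEN_STREAM `lex'
// might read:
//
//   LexToken t = lex.peekToken() ;   // inspect the next token without consuming it
//   LexToken u = lex.pullToken() ;   // now consume it; t and u are the same token
//   while (lex.peekProperToken() != null) {
//       System.out.println (lex.pullProperToken().value()) ;   // all remaining proper tokens
//   }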
// The following allows a LEX_TOKEN_STREAM object to be created for
// a given input file and a language-specific repertoire of lexical classes.

public class GenLexer implements LEX_TOKEN_STREAM {

    Reader reader ;     // for reading characters from input
    DFA[] acceptors ;   // array of acceptors for the lexical classes, in order of priority

    GenLexer (Reader reader, DFA[] acceptors) {
        this.reader = reader ;
        this.acceptors = acceptors ;
    }

    LexToken bufferToken ;          // buffer to allow 1-token lookahead
    boolean bufferInUse = false ;

    static final char EOF = (char)65535 ;   // i.e. (char)(-1), the value read() yields at end of input
    // Implementation of longest-match lexer as described in lectures.
    // We go for simplicity and clarity rather than maximum efficiency.

    LexToken nextToken ()
        throws LexError, StateOutOfRange, IOException {

        char c ;                          // current input character
        String definite = "" ;            // characters up to last acceptance point
        String maybe = "" ;               // characters since last acceptance point
        int acceptorIndex = -1 ;          // array index of highest priority acceptor
        boolean liveFound = false ;       // flags for use in
        boolean acceptorFound = false ;   // iteration over acceptors

        for (int i=0; i<acceptors.length; i++) {
            acceptors[i].reset() ;
        } ;

        do {
            c = (char)(reader.read()) ;
            acceptorFound = false ;
            liveFound = false ;
            if (c != EOF) {
                maybe += c ;
                for (int i=0; i<acceptors.length; i++) {
                    acceptors[i].processChar(c) ;
                    if (!acceptors[i].isDead()) {
                        liveFound = true ;
                    }
                    if (!acceptorFound && acceptors[i].isAccepting()) {
                        acceptorFound = true ;
                        acceptorIndex = i ;
                        definite += maybe ;
                        maybe = "" ;
                        reader.mark(10) ;   // register backup point in input
                                            // (at most 10 chars may be read before resetting)
                    } ;
                }
            }
        } while (liveFound && c != EOF) ;

        if (acceptorIndex >= 0) {
            // lex token has been found:
            // backup to last acceptance point and output token
            reader.reset() ;
            String theLexClass = acceptors[acceptorIndex].lexClass() ;
            return new LexToken (definite, theLexClass) ;
        } else if (c == EOF && maybe.equals("")) {
            // end of input already reached before nextToken was called
            reader.close() ;
            return null ;   // by convention, signifies end of input
        } else {
            reader.close() ;
            throw new LexError(maybe) ;
        }
    }
    public LexToken peekToken ()
        throws LexError, StateOutOfRange, IOException {
        if (bufferInUse) {
            return bufferToken ;
        } else {
            bufferToken = nextToken() ;
            bufferInUse = true ;
            return bufferToken ;
        }
    }

    public LexToken pullToken ()
        throws LexError, StateOutOfRange, IOException {
        peekToken () ;
        bufferInUse = false ;
        return bufferToken ;
    }

    public LexToken peekProperToken ()
        throws LexError, StateOutOfRange, IOException {
        LexToken tok = peekToken () ;
        while (tok != null && tok.lexClass().equals("")) {
            pullToken () ;
            tok = peekToken () ;
        }
        bufferToken = tok ;
        bufferInUse = true ;
        return tok ;
    }

    public LexToken pullProperToken ()
        throws LexError, StateOutOfRange, IOException {
        peekProperToken () ;
        bufferInUse = false ;
        return bufferToken ;
    }
}
// A weird example of a lexer using the DFAs constructed earlier

class DemoLexer extends GenLexer implements LEX_TOKEN_STREAM {

    static DFA evenLetterAcc = new EvenLetterAcceptor() ;
    static DFA andAcc = new AndAcceptor() ;
    static DFA spaceAcc = new SpaceAcceptor() ;
    static DFA[] acceptors = new DFA[] {evenLetterAcc, andAcc, spaceAcc} ;

    DemoLexer (Reader reader) {
        super(reader,acceptors) ;
    }
}
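// Since the acceptors are tried in array order, ties are resolved by priority:
// if two acceptors accept the same longest prefix, the one listed earlier wins.
// A DemoLexer can also be run over any Reader supporting mark/reset, e.g.
// (a hypothetical snippet using java.io.StringReader):
//
//   GenLexer lex = new DemoLexer (new StringReader ("abab &&")) ;
//   lex.pullProperToken() ;   // token "abab" of class EVEN
//   lex.pullProperToken() ;   // token "&&" of class "&&"
//   lex.pullProperToken() ;   // null: end of input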
class LexerDemo {

    public static void main (String[] args)
        throws LexError, StateOutOfRange, IOException {

        System.out.print ("Lexer> ") ;
        Reader reader = new BufferedReader (new InputStreamReader (System.in)) ;
        GenLexer demoLexer = new DemoLexer (reader) ;

        LexToken currTok = demoLexer.pullProperToken() ;
        while (currTok != null) {
            System.out.println (currTok.value() + " \t" + currTok.lexClass()) ;
            currTok = demoLexer.pullProperToken() ;
        } ;
    }
}

class LexError extends Exception {
    public LexError (String nonToken) {
        super ("Can't make lexical token from input \"" + nonToken + "\"") ;
    }
}
// To try out the lexer on the dummy examples, compile this file, type
// java LexerDemo
// and then type a line of input such as
// abcd &&&&
// You can also experiment with erroneous inputs.
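// With that line of input, the session might look something like this
// (tab spacing may vary):
//
//   Lexer> abcd &&&&
//   abcd     EVEN
//   &&       &&
//   &&       &&
//
// The program keeps reading until end of input (e.g. Ctrl-D on Unix), so
// further lines can be typed and lexed in the same session.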