1
1
pub use super :: token:: Tok ;
2
2
use std:: collections:: HashMap ;
3
- use std:: str:: CharIndices ;
4
3
5
- pub struct Lexer < ' input > {
6
- chars : CharIndices < ' input > ,
4
+ pub struct Lexer < T : Iterator < Item = char > > {
5
+ chars : T ,
7
6
at_begin_of_line : bool ,
8
7
nesting : usize , // Amount of parenthesis
9
8
indentation_stack : Vec < usize > ,
@@ -85,10 +84,129 @@ pub fn get_keywords() -> HashMap<String, Tok> {
85
84
86
85
pub type Spanned < Tok > = Result < ( Location , Tok , Location ) , LexicalError > ;
87
86
88
- impl < ' input > Lexer < ' input > {
89
- pub fn new ( input : & ' input str ) -> Self {
87
+ pub fn make_tokenizer < ' a > ( source : & ' a str ) -> impl Iterator < Item = Spanned < Tok > > + ' a {
88
+ let nlh = NewlineHandler :: new ( source. chars ( ) ) ;
89
+ let lch = LineContinationHandler :: new ( nlh) ;
90
+ let lexer = Lexer :: new ( lch) ;
91
+ lexer
92
+ }
93
+
94
/// An iterator adapter that normalizes the different newline conventions
/// (`\r\n` on Windows, lone `\r` on classic Mac) into a single `\n`.
///
/// It keeps a two-character lookahead window (`chr0`, `chr1`) over the
/// wrapped character source so a `\r\n` pair can be collapsed in one step.
pub struct NewlineHandler<T: Iterator<Item = char>> {
    source: T,
    chr0: Option<char>,
    chr1: Option<char>,
}

impl<T> NewlineHandler<T>
where
    T: Iterator<Item = char>,
{
    /// Wrap `source`, priming the two-character lookahead window.
    pub fn new(source: T) -> Self {
        let mut nlh = NewlineHandler {
            // Field-init shorthand (clippy: redundant_field_names).
            source,
            chr0: None,
            chr1: None,
        };
        // Two shifts fill both lookahead slots from the underlying source.
        nlh.shift();
        nlh.shift();
        nlh
    }

    /// Advance the lookahead window by one character, pulling the next
    /// character from the source, and return the character that fell out.
    fn shift(&mut self) -> Option<char> {
        let result = self.chr0;
        self.chr0 = self.chr1;
        self.chr1 = self.source.next();
        result
    }
}

impl<T> Iterator for NewlineHandler<T>
where
    T: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Collapse \r\n (Windows) and bare \r (Mac) into \n.
        loop {
            if self.chr0 == Some('\r') {
                if self.chr1 == Some('\n') {
                    // Windows EOL: drop the \r; the following \n is emitted as-is.
                    self.shift();
                } else {
                    // Mac EOL: rewrite the lone \r to \n in place.
                    self.chr0 = Some('\n')
                }
            } else {
                break;
            }
        }

        self.shift()
    }
}
150
+
151
/// An iterator adapter that removes line continuations: a backslash
/// immediately followed by `\n` is deleted, gluing the two physical lines
/// into one logical line.
///
/// Expects its input to already have newlines normalized to `\n`
/// (see `NewlineHandler`).
pub struct LineContinationHandler<T: Iterator<Item = char>> {
    source: T,
    chr0: Option<char>,
    chr1: Option<char>,
}

impl<T> LineContinationHandler<T>
where
    T: Iterator<Item = char>,
{
    /// Wrap `source`, priming the two-character lookahead window.
    pub fn new(source: T) -> Self {
        let mut nlh = LineContinationHandler {
            // Field-init shorthand (clippy: redundant_field_names).
            source,
            chr0: None,
            chr1: None,
        };
        // Two shifts fill both lookahead slots from the underlying source.
        nlh.shift();
        nlh.shift();
        nlh
    }

    /// Advance the lookahead window by one character, pulling the next
    /// character from the source, and return the character that fell out.
    fn shift(&mut self) -> Option<char> {
        let result = self.chr0;
        self.chr0 = self.chr1;
        self.chr1 = self.source.next();
        result
    }
}

impl<T> Iterator for LineContinationHandler<T>
where
    T: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Remove \<newline> pairs: neither character is emitted.
        // (The previous comment here — "Collapse \r\n into \n" — was a
        // stale copy-paste from NewlineHandler.)
        loop {
            if self.chr0 == Some('\\') && self.chr1 == Some('\n') {
                // Skip backslash and newline
                self.shift();
                self.shift();
            } else {
                break;
            }
        }

        self.shift()
    }
}
202
+
203
+ impl < T > Lexer < T >
204
+ where
205
+ T : Iterator < Item = char > ,
206
+ {
207
+ pub fn new ( input : T ) -> Self {
90
208
let mut lxr = Lexer {
91
- chars : input. char_indices ( ) ,
209
+ chars : input,
92
210
at_begin_of_line : true ,
93
211
nesting : 0 ,
94
212
indentation_stack : vec ! [ 0 ] ,
@@ -155,16 +273,20 @@ impl<'input> Lexer<'input> {
155
273
Some ( '\n' ) => {
156
274
return ;
157
275
}
158
- Some ( '\r' ) => {
159
- return ;
160
- }
161
276
Some ( _) => { }
162
277
None => return ,
163
278
}
164
279
}
165
280
}
166
281
167
282
fn lex_string ( & mut self ) -> Spanned < Tok > {
283
+ let type_char = match self . chr0 {
284
+ Some ( 'u' ) | Some ( 'f' ) | Some ( 'r' ) => self . next_char ( ) ,
285
+ _ => None ,
286
+ } ;
287
+
288
+ let is_raw = type_char == Some ( 'r' ) ;
289
+
168
290
let quote_char = self . next_char ( ) . unwrap ( ) ;
169
291
let mut string_content = String :: new ( ) ;
170
292
let start_pos = self . get_pos ( ) ;
@@ -182,43 +304,36 @@ impl<'input> Lexer<'input> {
182
304
loop {
183
305
match self . next_char ( ) {
184
306
Some ( '\\' ) => {
185
- match self . next_char ( ) {
186
- Some ( '\\' ) => {
187
- string_content. push ( '\\' ) ;
188
- }
189
- Some ( '\'' ) => string_content. push ( '\'' ) ,
190
- Some ( '\"' ) => string_content. push ( '\"' ) ,
191
- Some ( '\n' ) => {
192
- // Ignore Unix EOL character
193
- }
194
- Some ( '\r' ) => {
195
- match self . chr0 {
196
- Some ( '\n' ) => {
197
- // Ignore Windows EOL characters (2 bytes)
198
- self . next_char ( ) ;
199
- }
200
- _ => {
201
- // Ignore Mac EOL character
202
- }
307
+ if is_raw {
308
+ string_content. push ( '\\' ) ;
309
+ } else {
310
+ match self . next_char ( ) {
311
+ Some ( '\\' ) => {
312
+ string_content. push ( '\\' ) ;
313
+ }
314
+ Some ( '\'' ) => string_content. push ( '\'' ) ,
315
+ Some ( '\"' ) => string_content. push ( '\"' ) ,
316
+ Some ( '\n' ) => {
317
+ // Ignore Unix EOL character
318
+ }
319
+ Some ( 'a' ) => string_content. push ( '\x07' ) ,
320
+ Some ( 'b' ) => string_content. push ( '\x08' ) ,
321
+ Some ( 'f' ) => string_content. push ( '\x0c' ) ,
322
+ Some ( 'n' ) => {
323
+ string_content. push ( '\n' ) ;
324
+ }
325
+ Some ( 'r' ) => string_content. push ( '\r' ) ,
326
+ Some ( 't' ) => {
327
+ string_content. push ( '\t' ) ;
328
+ }
329
+ Some ( 'v' ) => string_content. push ( '\x0b' ) ,
330
+ Some ( c) => {
331
+ string_content. push ( '\\' ) ;
332
+ string_content. push ( c) ;
333
+ }
334
+ None => {
335
+ return Err ( LexicalError :: StringError ) ;
203
336
}
204
- }
205
- Some ( 'a' ) => string_content. push ( '\x07' ) ,
206
- Some ( 'b' ) => string_content. push ( '\x08' ) ,
207
- Some ( 'f' ) => string_content. push ( '\x0c' ) ,
208
- Some ( 'n' ) => {
209
- string_content. push ( '\n' ) ;
210
- }
211
- Some ( 'r' ) => string_content. push ( '\r' ) ,
212
- Some ( 't' ) => {
213
- string_content. push ( '\t' ) ;
214
- }
215
- Some ( 'v' ) => string_content. push ( '\x0b' ) ,
216
- Some ( c) => {
217
- string_content. push ( '\\' ) ;
218
- string_content. push ( c) ;
219
- }
220
- None => {
221
- return Err ( LexicalError :: StringError ) ;
222
337
}
223
338
}
224
339
}
@@ -281,7 +396,7 @@ impl<'input> Lexer<'input> {
281
396
let c = self . chr0 ;
282
397
let nxt = self . chars . next ( ) ;
283
398
self . chr0 = self . chr1 ;
284
- self . chr1 = nxt. map ( |x| x . 1 ) ;
399
+ self . chr1 = nxt;
285
400
self . location . column += 1 ;
286
401
c
287
402
}
@@ -318,17 +433,6 @@ impl<'input> Lexer<'input> {
318
433
self . at_begin_of_line = true ;
319
434
continue ' top_loop;
320
435
}
321
- Some ( '\r' ) => {
322
- // Empty line!
323
- self . next_char ( ) ;
324
- if self . chr0 == Some ( '\n' ) {
325
- // absorb two bytes if Windows line ending
326
- self . next_char ( ) ;
327
- }
328
- self . at_begin_of_line = true ;
329
- self . new_line ( ) ;
330
- continue ' top_loop;
331
- }
332
436
Some ( '\n' ) => {
333
437
// Empty line!
334
438
self . next_char ( ) ;
@@ -376,7 +480,18 @@ impl<'input> Lexer<'input> {
376
480
377
481
match self . chr0 {
378
482
Some ( '0' ...'9' ) => return Some ( self . lex_number ( ) ) ,
379
- Some ( '_' ) | Some ( 'a' ...'z' ) | Some ( 'A' ...'Z' ) => return Some ( self . lex_identifier ( ) ) ,
483
+ Some ( '_' ) | Some ( 'a' ...'z' ) | Some ( 'A' ...'Z' ) => {
484
+ // Detect r"", f"" and u""
485
+ match self . chr0 {
486
+ Some ( 'r' ) | Some ( 'u' ) | Some ( 'f' ) => match self . chr1 {
487
+ Some ( '\'' ) | Some ( '\"' ) => {
488
+ return Some ( self . lex_string ( ) ) ;
489
+ }
490
+ _ => return Some ( self . lex_identifier ( ) ) ,
491
+ } ,
492
+ _ => return Some ( self . lex_identifier ( ) ) ,
493
+ }
494
+ }
380
495
Some ( '#' ) => {
381
496
self . lex_comment ( ) ;
382
497
continue ;
@@ -691,20 +806,6 @@ impl<'input> Lexer<'input> {
691
806
let tok_end = self . get_pos ( ) ;
692
807
return Some ( Ok ( ( tok_start, Tok :: Dot , tok_end) ) ) ;
693
808
}
694
- Some ( '\r' ) => {
695
- let tok_start = self . get_pos ( ) ;
696
- self . next_char ( ) ;
697
- let tok_end = self . get_pos ( ) ;
698
- self . new_line ( ) ;
699
-
700
- // Depending on the nesting level, we emit newline or not:
701
- if self . nesting == 0 {
702
- self . at_begin_of_line = true ;
703
- return Some ( Ok ( ( tok_start, Tok :: Newline , tok_end) ) ) ;
704
- } else {
705
- continue ;
706
- }
707
- }
708
809
Some ( '\n' ) => {
709
810
let tok_start = self . get_pos ( ) ;
710
811
self . next_char ( ) ;
@@ -746,7 +847,10 @@ impl<'input> Lexer<'input> {
746
847
Calling the next element in the iterator will yield the next lexical
747
848
token.
748
849
*/
749
- impl < ' input > Iterator for Lexer < ' input > {
850
+ impl < T > Iterator for Lexer < T >
851
+ where
852
+ T : Iterator < Item = char > ,
853
+ {
750
854
type Item = Spanned < Tok > ;
751
855
752
856
fn next ( & mut self ) -> Option < Self :: Item > {
@@ -766,18 +870,46 @@ impl<'input> Iterator for Lexer<'input> {
766
870
767
871
#[ cfg( test) ]
768
872
mod tests {
769
- use super :: { Lexer , Tok } ;
873
+ use super :: { make_tokenizer , NewlineHandler , Tok } ;
770
874
use std:: iter:: FromIterator ;
875
+ use std:: iter:: Iterator ;
771
876
772
877
const WINDOWS_EOL : & str = "\r \n " ;
773
878
const MAC_EOL : & str = "\r " ;
774
879
const UNIX_EOL : & str = "\n " ;
775
880
776
881
pub fn lex_source ( source : & String ) -> Vec < Tok > {
777
- let lexer = Lexer :: new ( source) ;
882
+ let lexer = make_tokenizer ( source) ;
778
883
Vec :: from_iter ( lexer. map ( |x| x. unwrap ( ) . 1 ) )
779
884
}
780
885
886
#[test]
fn test_newline_processor() {
    // A backslash followed by a Windows EOL: the handler must only
    // normalize \r\n to \n, leaving the backslash itself untouched.
    let src = "b\\\r\n";
    assert_eq!(4, src.len());
    let handled: Vec<char> = NewlineHandler::new(src.chars()).collect();
    assert_eq!(handled, vec!['b', '\\', '\n']);
}
895
+
896
#[test]
fn test_raw_string() {
    // Source is: r"\\" "\\"
    // The raw string keeps both backslashes; the plain string unescapes
    // \\ to a single backslash.
    let source = String::from("r\"\\\\\" \"\\\\\"");
    let tokens = lex_source(&source);
    let expected = vec![
        Tok::String {
            value: "\\\\".to_string(),
        },
        Tok::String {
            value: "\\".to_string(),
        },
    ];
    assert_eq!(tokens, expected);
}
912
+
781
913
macro_rules! test_line_comment {
782
914
( $( $name: ident: $eol: expr, ) * ) => {
783
915
$(
0 commit comments