Commit b03dd62

Add raw strings to lexer. Add line break continuation to lexer.
1 parent 09e2c02 commit b03dd62

6 files changed: +279 -88 lines

parser/src/ast.rs

Lines changed: 2 additions & 1 deletion
@@ -91,7 +91,8 @@ pub enum Statement {
         orelse: Option<Vec<LocatedStatement>>,
     },
     Raise {
-        expression: Option<Expression>,
+        exception: Option<Expression>,
+        cause: Option<Expression>,
     },
     Try {
         body: Vec<LocatedStatement>,
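
The `Raise` variant now carries two operands instead of one, mirroring Python's `raise exc from cause` form, where both the exception and the chained cause are optional. A minimal sketch of how the two fields map onto that syntax, using simplified stand-in types rather than the crate's actual definitions:

// Hypothetical, simplified stand-ins for the crate's Expression/Statement.
#[derive(Debug)]
enum Expression {
    Identifier(String),
}

#[derive(Debug)]
enum Statement {
    Raise {
        exception: Option<Expression>,
        cause: Option<Expression>,
    },
}

fn main() {
    // `raise ValueError from err` fills both fields;
    // a bare `raise` leaves both as None.
    let stmt = Statement::Raise {
        exception: Some(Expression::Identifier("ValueError".into())),
        cause: Some(Expression::Identifier("err".into())),
    };
    println!("{:?}", stmt);
}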

parser/src/lexer.rs

Lines changed: 207 additions & 75 deletions
@@ -1,9 +1,8 @@
 pub use super::token::Tok;
 use std::collections::HashMap;
-use std::str::CharIndices;
 
-pub struct Lexer<'input> {
-    chars: CharIndices<'input>,
+pub struct Lexer<T: Iterator<Item = char>> {
+    chars: T,
     at_begin_of_line: bool,
     nesting: usize, // Amount of parenthesis
     indentation_stack: Vec<usize>,
@@ -85,10 +84,129 @@ pub fn get_keywords() -> HashMap<String, Tok> {
 
 pub type Spanned<Tok> = Result<(Location, Tok, Location), LexicalError>;
 
-impl<'input> Lexer<'input> {
-    pub fn new(input: &'input str) -> Self {
+pub fn make_tokenizer<'a>(source: &'a str) -> impl Iterator<Item = Spanned<Tok>> + 'a {
+    let nlh = NewlineHandler::new(source.chars());
+    let lch = LineContinationHandler::new(nlh);
+    let lexer = Lexer::new(lch);
+    lexer
+}
+
+// The newline handler is an iterator which collapses different newline
+// types into \n always.
+pub struct NewlineHandler<T: Iterator<Item = char>> {
+    source: T,
+    chr0: Option<char>,
+    chr1: Option<char>,
+}
+
+impl<T> NewlineHandler<T>
+where
+    T: Iterator<Item = char>,
+{
+    pub fn new(source: T) -> Self {
+        let mut nlh = NewlineHandler {
+            source: source,
+            chr0: None,
+            chr1: None,
+        };
+        nlh.shift();
+        nlh.shift();
+        nlh
+    }
+
+    fn shift(&mut self) -> Option<char> {
+        let result = self.chr0;
+        self.chr0 = self.chr1;
+        self.chr1 = self.source.next();
+        result
+    }
+}
+
+impl<T> Iterator for NewlineHandler<T>
+where
+    T: Iterator<Item = char>,
+{
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Collapse \r\n into \n
+        loop {
+            if self.chr0 == Some('\r') {
+                if self.chr1 == Some('\n') {
+                    // Transform windows EOL into \n
+                    self.shift();
+                } else {
+                    // Transform MAC EOL into \n
+                    self.chr0 = Some('\n')
+                }
+            } else {
+                break;
+            }
+        }
+
+        self.shift()
+    }
+}
+
+// Glues \ and \n into a single line:
+pub struct LineContinationHandler<T: Iterator<Item = char>> {
+    source: T,
+    chr0: Option<char>,
+    chr1: Option<char>,
+}
+
+impl<T> LineContinationHandler<T>
+where
+    T: Iterator<Item = char>,
+{
+    pub fn new(source: T) -> Self {
+        let mut nlh = LineContinationHandler {
+            source: source,
+            chr0: None,
+            chr1: None,
+        };
+        nlh.shift();
+        nlh.shift();
+        nlh
+    }
+
+    fn shift(&mut self) -> Option<char> {
+        let result = self.chr0;
+        self.chr0 = self.chr1;
+        self.chr1 = self.source.next();
+        result
+    }
+}
+
+impl<T> Iterator for LineContinationHandler<T>
+where
+    T: Iterator<Item = char>,
+{
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Glue backslash-newline pairs together by removing them
+        loop {
+            if self.chr0 == Some('\\') && self.chr1 == Some('\n') {
+                // Skip backslash and newline
+                self.shift();
+                self.shift();
+            } else {
+                break;
+            }
+        }
+
+        self.shift()
+    }
+}
+
+impl<T> Lexer<T>
+where
+    T: Iterator<Item = char>,
+{
+    pub fn new(input: T) -> Self {
         let mut lxr = Lexer {
-            chars: input.char_indices(),
+            chars: input,
             at_begin_of_line: true,
             nesting: 0,
             indentation_stack: vec![0],
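
`make_tokenizer` stacks three iterators: `NewlineHandler` first normalizes `\r\n` and `\r` to `\n`, `LineContinationHandler` then deletes backslash-newline pairs, and `Lexer` consumes the cleaned character stream. Because the newline pass runs first, a backslash before `\r\n` is also glued. For illustration only, the two handlers can be compressed into a single peekable pass; this standalone sketch is not the crate's code:

// One-pass equivalent of the two chained pre-processing iterators.
// (Unlike the two-pass version, a backslash before \r\n is not glued here.)
fn normalize(source: &str) -> String {
    let mut out = String::new();
    let mut chars = source.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '\r' => {
                // Windows \r\n and old-Mac \r both collapse into \n.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                out.push('\n');
            }
            '\\' if chars.peek() == Some(&'\n') => {
                // Explicit line continuation: drop both characters.
                chars.next();
            }
            other => out.push(other),
        }
    }
    out
}

fn main() {
    assert_eq!(normalize("a = 1\r\nb = \\\n2\r"), "a = 1\nb = 2\n");
}
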
@@ -155,16 +273,20 @@ impl<'input> Lexer<'input> {
                 Some('\n') => {
                     return;
                 }
-                Some('\r') => {
-                    return;
-                }
                 Some(_) => {}
                 None => return,
             }
         }
     }
 
     fn lex_string(&mut self) -> Spanned<Tok> {
+        let type_char = match self.chr0 {
+            Some('u') | Some('f') | Some('r') => self.next_char(),
+            _ => None,
+        };
+
+        let is_raw = type_char == Some('r');
+
         let quote_char = self.next_char().unwrap();
         let mut string_content = String::new();
         let start_pos = self.get_pos();
@@ -182,43 +304,36 @@ impl<'input> Lexer<'input> {
         loop {
             match self.next_char() {
                 Some('\\') => {
-                    match self.next_char() {
-                        Some('\\') => {
-                            string_content.push('\\');
-                        }
-                        Some('\'') => string_content.push('\''),
-                        Some('\"') => string_content.push('\"'),
-                        Some('\n') => {
-                            // Ignore Unix EOL character
-                        }
-                        Some('\r') => {
-                            match self.chr0 {
-                                Some('\n') => {
-                                    // Ignore Windows EOL characters (2 bytes)
-                                    self.next_char();
-                                }
-                                _ => {
-                                    // Ignore Mac EOL character
-                                }
-                            }
-                        }
-                        Some('a') => string_content.push('\x07'),
-                        Some('b') => string_content.push('\x08'),
-                        Some('f') => string_content.push('\x0c'),
-                        Some('n') => {
-                            string_content.push('\n');
-                        }
-                        Some('r') => string_content.push('\r'),
-                        Some('t') => {
-                            string_content.push('\t');
-                        }
-                        Some('v') => string_content.push('\x0b'),
-                        Some(c) => {
-                            string_content.push('\\');
-                            string_content.push(c);
-                        }
-                        None => {
-                            return Err(LexicalError::StringError);
-                        }
+                    if is_raw {
+                        string_content.push('\\');
+                    } else {
+                        match self.next_char() {
+                            Some('\\') => {
+                                string_content.push('\\');
+                            }
+                            Some('\'') => string_content.push('\''),
+                            Some('\"') => string_content.push('\"'),
+                            Some('\n') => {
+                                // Ignore Unix EOL character
+                            }
+                            Some('a') => string_content.push('\x07'),
+                            Some('b') => string_content.push('\x08'),
+                            Some('f') => string_content.push('\x0c'),
+                            Some('n') => {
+                                string_content.push('\n');
+                            }
+                            Some('r') => string_content.push('\r'),
+                            Some('t') => {
+                                string_content.push('\t');
+                            }
+                            Some('v') => string_content.push('\x0b'),
+                            Some(c) => {
+                                string_content.push('\\');
+                                string_content.push(c);
+                            }
+                            None => {
+                                return Err(LexicalError::StringError);
+                            }
+                        }
                     }
                 }
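
With `is_raw` set, a backslash is pushed through literally and the character after it is handled by the ordinary loop on the next iteration, so the Python literal r"\n" lexes to the two characters backslash and n, while "\n" lexes to a single newline. The branch boils down to something like this standalone helper (illustrative only, not the crate's code):

// The escape-vs-raw decision from lex_string, reduced to a free function.
fn unescape(body: &str, is_raw: bool) -> String {
    let mut out = String::new();
    let mut chars = body.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        if is_raw {
            // Raw mode: keep the backslash, let the loop see the next char.
            out.push('\\');
        } else {
            match chars.next() {
                Some('n') => out.push('\n'),
                Some('t') => out.push('\t'),
                Some('\\') => out.push('\\'),
                // Unrecognized escapes fall through verbatim, as in the commit.
                Some(other) => {
                    out.push('\\');
                    out.push(other);
                }
                None => break,
            }
        }
    }
    out
}

fn main() {
    assert_eq!(unescape(r"a\nb", false), "a\nb");
    assert_eq!(unescape(r"a\nb", true), "a\\nb");
}
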
@@ -281,7 +396,7 @@ impl<'input> Lexer<'input> {
         let c = self.chr0;
         let nxt = self.chars.next();
         self.chr0 = self.chr1;
-        self.chr1 = nxt.map(|x| x.1);
+        self.chr1 = nxt;
         self.location.column += 1;
         c
     }
@@ -318,17 +433,6 @@ impl<'input> Lexer<'input> {
                     self.at_begin_of_line = true;
                     continue 'top_loop;
                 }
-                Some('\r') => {
-                    // Empty line!
-                    self.next_char();
-                    if self.chr0 == Some('\n') {
-                        // absorb two bytes if Windows line ending
-                        self.next_char();
-                    }
-                    self.at_begin_of_line = true;
-                    self.new_line();
-                    continue 'top_loop;
-                }
                 Some('\n') => {
                     // Empty line!
                     self.next_char();
@@ -376,7 +480,18 @@ impl<'input> Lexer<'input> {
 
             match self.chr0 {
                 Some('0'...'9') => return Some(self.lex_number()),
-                Some('_') | Some('a'...'z') | Some('A'...'Z') => return Some(self.lex_identifier()),
+                Some('_') | Some('a'...'z') | Some('A'...'Z') => {
+                    // Detect r"", f"" and u""
+                    match self.chr0 {
+                        Some('r') | Some('u') | Some('f') => match self.chr1 {
+                            Some('\'') | Some('\"') => {
+                                return Some(self.lex_string());
+                            }
+                            _ => return Some(self.lex_identifier()),
+                        },
+                        _ => return Some(self.lex_identifier()),
+                    }
+                }
                 Some('#') => {
                     self.lex_comment();
                     continue;
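
Prefix detection rides on the lexer's existing one-character lookahead: a leading `r`, `u`, or `f` is treated as a string prefix only when `chr1` is already a quote; otherwise the identifier path keeps handling words like `raise` and `for`. Two-letter prefixes such as rb"" would need a second lookahead character and are not covered by this change. A condensed standalone sketch of the decision:

// The one-lookahead prefix decision as a free function.
fn starts_prefixed_string(chr0: char, chr1: Option<char>) -> bool {
    matches!(chr0, 'r' | 'u' | 'f') && matches!(chr1, Some('\'') | Some('"'))
}

fn main() {
    assert!(starts_prefixed_string('r', Some('"')));
    assert!(!starts_prefixed_string('r', Some('a'))); // identifier, e.g. `raise`
}
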
@@ -691,20 +806,6 @@ impl<'input> Lexer<'input> {
                     let tok_end = self.get_pos();
                     return Some(Ok((tok_start, Tok::Dot, tok_end)));
                 }
-                Some('\r') => {
-                    let tok_start = self.get_pos();
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.new_line();
-
-                    // Depending on the nesting level, we emit newline or not:
-                    if self.nesting == 0 {
-                        self.at_begin_of_line = true;
-                        return Some(Ok((tok_start, Tok::Newline, tok_end)));
-                    } else {
-                        continue;
-                    }
-                }
                 Some('\n') => {
                     let tok_start = self.get_pos();
                     self.next_char();
@@ -746,7 +847,10 @@ impl<'input> Lexer<'input> {
 Calling the next element in the iterator will yield the next lexical
 token.
 */
-impl<'input> Iterator for Lexer<'input> {
+impl<T> Iterator for Lexer<T>
+where
+    T: Iterator<Item = char>,
+{
     type Item = Spanned<Tok>;
 
     fn next(&mut self) -> Option<Self::Item> {
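
Because the lexer, and therefore the whole `make_tokenizer` chain, is an ordinary `Iterator` over `Spanned<Tok>` results, token streams compose with the standard adapters. A hedged usage sketch, assuming this crate's `make_tokenizer`, `Tok`, and `LexicalError` are in scope:

// Hypothetical helper: collect the tokens of a source string, stopping at
// the first LexicalError instead of unwrapping, and dropping span info.
fn tokens_of(source: &str) -> Result<Vec<Tok>, LexicalError> {
    make_tokenizer(source)
        .map(|item| item.map(|(_, tok, _)| tok))
        .collect()
}
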
@@ -766,18 +870,46 @@ impl<'input> Iterator for Lexer<'input> {
 
 #[cfg(test)]
 mod tests {
-    use super::{Lexer, Tok};
+    use super::{make_tokenizer, NewlineHandler, Tok};
     use std::iter::FromIterator;
+    use std::iter::Iterator;
 
     const WINDOWS_EOL: &str = "\r\n";
     const MAC_EOL: &str = "\r";
     const UNIX_EOL: &str = "\n";
 
     pub fn lex_source(source: &String) -> Vec<Tok> {
-        let lexer = Lexer::new(source);
+        let lexer = make_tokenizer(source);
         Vec::from_iter(lexer.map(|x| x.unwrap().1))
     }
 
+    #[test]
+    fn test_newline_processor() {
+        // \r\n collapses into \n; the preceding backslash is left alone:
+        let src = "b\\\r\n";
+        assert_eq!(4, src.len());
+        let nlh = NewlineHandler::new(src.chars());
+        let x: Vec<char> = nlh.collect();
+        assert_eq!(vec!['b', '\\', '\n'], x);
+    }
+
+    #[test]
+    fn test_raw_string() {
+        let source = String::from("r\"\\\\\" \"\\\\\"");
+        let tokens = lex_source(&source);
+        assert_eq!(
+            tokens,
+            vec![
+                Tok::String {
+                    value: "\\\\".to_string()
+                },
+                Tok::String {
+                    value: "\\".to_string()
+                }
+            ]
+        );
+    }
+
     macro_rules! test_line_comment {
         ($($name:ident: $eol:expr,)*) => {
             $(
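
The literal in test_raw_string hides two layers of escaping and is worth unpacking: the Rust string decodes to the Python source r"\\" "\\", so the raw string keeps both backslashes while the ordinary string collapses its escaped pair into one. A small standalone check (not part of the commit):

fn main() {
    // The escaped Rust literal and the equivalent Rust raw literal
    // both spell the Python source  r"\\" "\\"
    let source = "r\"\\\\\" \"\\\\\"";
    assert_eq!(source, r#"r"\\" "\\""#);
}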

parser/src/parser.rs

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ pub fn parse(filename: &Path) -> Result<ast::Program, String> {
 
 macro_rules! do_lalr_parsing {
     ($input: expr, $pat: ident, $tok: ident) => {{
-        let lxr = lexer::Lexer::new($input);
+        let lxr = lexer::make_tokenizer($input);
         let marker_token = (Default::default(), token::Tok::$tok, Default::default());
         let tokenizer = iter::once(Ok(marker_token)).chain(lxr);
 
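
`make_tokenizer` slots into the existing LALRPOP plumbing unchanged because it yields the same `Spanned<Tok>` items the old `Lexer::new` produced. The surrounding macro then prepends a synthetic marker token via `iter::once(..).chain(..)`, a common way to pick one of several grammar entry points from a single token stream. A standalone sketch of that pattern with plain strings:

use std::iter;

fn main() {
    // Prepend one synthetic item to an existing token stream.
    let tokens = vec!["NAME", "EQ", "NUMBER"];
    let stream: Vec<&str> = iter::once("START_MARKER").chain(tokens).collect();
    assert_eq!(stream, ["START_MARKER", "NAME", "EQ", "NUMBER"]);
}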
