From 97f861a782744f27373d24aad0ed72697ab54d5d Mon Sep 17 00:00:00 2001 From: Federico Guerinoni Date: Tue, 13 Aug 2024 12:16:13 +0200 Subject: [PATCH] lex: Handle the char variable initialization There was a bug also when a symbol is at the end of the line, now it is handled correctly for the tick symbol. Signed-off-by: Federico Guerinoni --- src/lexer/mod.rs | 8 ++++---- src/lexer/states.rs | 20 ++++++++++++++++++-- src/lexer/token.rs | 15 +++++++++++---- testdata/identifier/id_char_assign.fs | 1 + testdata/identifier/id_char_assign.tokens | 11 +++++++++++ 5 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 testdata/identifier/id_char_assign.fs create mode 100644 testdata/identifier/id_char_assign.tokens diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 68672d3..271631c 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -54,13 +54,13 @@ impl Iterator for Lexer { Ok(transition) => transition, Err(err) => { self.errors.push(err.clone()); - match err { + return match err { LexerError::UnexpectedToken(token) => { error!("Unexpected token: {}", token); // TODO: return a transition to continue lexing (for error recovery) - return None; + None } - } + }; } }; let (state, transition_kind) = transition.into_parts(); @@ -133,7 +133,7 @@ mod tests { #[test] fn identifier() { let fs_files = collect_fs_files("./testdata/identifier", true); - assert_eq!(fs_files.len(), 17); + assert_eq!(fs_files.len(), 18); for path in fs_files { info!("file -> {:?}", path); diff --git a/src/lexer/states.rs b/src/lexer/states.rs index 921f6ff..6a918b5 100644 --- a/src/lexer/states.rs +++ b/src/lexer/states.rs @@ -3,6 +3,7 @@ use super::Lexer; use super::LexerError; use crate::lexer::token::Token; use crate::lexer::token::TokenKind; +use crate::lexer::token::TokenKind::TokenTick; use std::fmt::Debug; pub trait State: Debug { @@ -137,7 +138,7 @@ pub struct StateWord; impl State for StateWord { fn visit(&self, cursor: &mut Cursor) -> Result { match cursor.peek() { - 
Some(c) if c.is_alphanumeric() || c.eq(&'_') => Ok(Lexer::proceed( + Some(c) if c.is_alphabetic() || c.eq(&'_') => Ok(Lexer::proceed( Box::new(StateWord), TransitionKind::AdvanceOffset, )), @@ -162,7 +163,7 @@ pub struct StateSymbol; impl StateSymbol { fn is_symbol(c: char) -> bool { - matches!(c, ':' | '=' | '\n') + matches!(c, ':' | '=' | '\n' | '\'') } } @@ -170,6 +171,21 @@ impl State for StateSymbol { fn visit(&self, cursor: &mut Cursor) -> Result { match cursor.peek() { Some('\n') => { + let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string(); + let token_kind = TokenKind::from(&lexeme); + // NOTE: if a '\n' is found and it was scanning another "symbol" token, the previous was mangled, and only the '\n' is emitted, + // right now we need to handle only TokenTick since can be at the end of the line, but this can be extended to other symbols + if token_kind == TokenTick { + return Ok(Lexer::proceed( + Box::new(StateStart), + TransitionKind::EmitToken(Token::new( + token_kind, + lexeme, + cursor.location().clone(), + )), + )); + } + let transition = Lexer::proceed( Box::new(StateStart), TransitionKind::EmitToken(Token::new( diff --git a/src/lexer/token.rs b/src/lexer/token.rs index c0a9064..bfcfcd5 100644 --- a/src/lexer/token.rs +++ b/src/lexer/token.rs @@ -5,6 +5,7 @@ use std::path::{Path, PathBuf}; const KEYWORD_INT: &str = "int"; const KEYWORD_FLOAT: &str = "float"; const KEYWORD_BOOL: &str = "bool"; +const KEYWORD_CHAR: &str = "char"; const KEYWORD_BOOL_TRUE: &str = "true"; const KEYWORD_BOOL_FALSE: &str = "false"; const SEPARATOR_COLON: &str = ":"; @@ -15,6 +16,7 @@ pub enum Literal { Int(i64), Float(f64), Bool(bool), + Char(char), } #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] @@ -25,6 +27,7 @@ pub enum TokenKind { TokenNewLine, // \n TokenColon, // : TokenAssign, // = + TokenTick, // ' TokenEOF, // End of file } @@ -34,6 +37,7 @@ impl TokenKind { KEYWORD_INT => Some(TokenKind::TokenKeyword), KEYWORD_FLOAT => 
Some(TokenKind::TokenKeyword), KEYWORD_BOOL => Some(TokenKind::TokenKeyword), + KEYWORD_CHAR => Some(TokenKind::TokenKeyword), KEYWORD_BOOL_TRUE => Some(TokenKind::TokenLiteral(Literal::Bool(true))), KEYWORD_BOOL_FALSE => Some(TokenKind::TokenLiteral(Literal::Bool(false))), _ => None, @@ -42,9 +46,10 @@ impl TokenKind { fn match_number(lexeme: &str) -> Option { if lexeme.chars().all(char::is_numeric) { - return Some(TokenKind::TokenLiteral(Literal::Int( - lexeme.parse().unwrap(), - ))); + return match lexeme.parse() { + Ok(value) => Some(TokenKind::TokenLiteral(Literal::Int(value))), + Err(_) => None, + }; } if lexeme.contains('.') { @@ -60,6 +65,7 @@ impl TokenKind { match lexeme { SEPARATOR_COLON => Some(TokenKind::TokenColon), SEPARATOR_ASSIGN => Some(TokenKind::TokenAssign), + "'" => Some(TokenKind::TokenTick), _ => None, } } @@ -86,7 +92,6 @@ impl From<&String> for TokenKind { TokenKind::TokenIdentifier } } - /// The location of a token in the source code in a uman-readable format #[derive(Debug, Clone, Eq, PartialEq, Deserialize, Serialize)] pub struct TokenLocation { @@ -231,6 +236,7 @@ impl std::fmt::Display for Literal { Literal::Int(value) => write!(f, "Int({})", value), Literal::Float(value) => write!(f, "Float({})", value), Literal::Bool(value) => write!(f, "Bool({})", value), + Literal::Char(value) => write!(f, "Char({})", value), } } } @@ -243,6 +249,7 @@ impl std::fmt::Display for TokenKind { TokenKind::TokenNewLine => write!(f, "TokenNewLine"), TokenKind::TokenColon => write!(f, "TokenColon"), TokenKind::TokenAssign => write!(f, "TokenAssign"), + TokenKind::TokenTick => write!(f, "TokenTick"), TokenKind::TokenEOF => write!(f, "TokenEOF"), } } diff --git a/testdata/identifier/id_char_assign.fs b/testdata/identifier/id_char_assign.fs new file mode 100644 index 0000000..2a1a202 --- /dev/null +++ b/testdata/identifier/id_char_assign.fs @@ -0,0 +1 @@ +my_char: char = 'a' diff --git a/testdata/identifier/id_char_assign.tokens 
b/testdata/identifier/id_char_assign.tokens new file mode 100644 index 0000000..4cfc31d --- /dev/null +++ b/testdata/identifier/id_char_assign.tokens @@ -0,0 +1,11 @@ +[ + {"kind": "TokenIdentifier","lexeme": "my_char","location": {"file_path": "","line": 0,"column_start": 0,"column_end": 7}}, + {"kind": "TokenColon","lexeme": ":","location": {"file_path": "","line": 0,"column_start": 7,"column_end": 8}}, + {"kind": "TokenKeyword","lexeme": "char","location": {"file_path": "","line": 0,"column_start": 9,"column_end": 13}}, + {"kind": "TokenAssign","lexeme": "=","location": {"file_path": "","line": 0,"column_start": 14,"column_end": 15}}, + {"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 16,"column_end": 17}}, + {"kind": "TokenIdentifier","lexeme": "a","location": {"file_path": "","line": 0,"column_start": 17,"column_end": 18}}, + {"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 18,"column_end": 19}}, + {"kind": "TokenNewLine","lexeme": "\\n","location": {"file_path": "","line": 0,"column_start": 19,"column_end": 19}}, + {"kind": "TokenEOF","lexeme": "","location": {"file_path": "","line": 1,"column_start": 0,"column_end": 0}} +]