Skip to content

Commit

Permalink
lex: Handle the char variable initialization
Browse files Browse the repository at this point in the history
There was also a bug when a symbol appears at the end of the line; it is now
handled correctly for the tick symbol.
  • Loading branch information
guerinoni committed Aug 13, 2024
1 parent c50fe9e commit 637bd2a
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 9 deletions.
6 changes: 3 additions & 3 deletions src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ impl Iterator for Lexer {
Ok(transition) => transition,
Err(err) => {
self.errors.push(err.clone());
match err {
return match err {
LexerError::UnexpectedToken(token) => {
error!("Unexpected token: {}", token);
// TODO: return a transition to continue lexing (for error recovery)
return None;
None
}
}
}
Expand Down Expand Up @@ -133,7 +133,7 @@ mod tests {
#[test]
fn identifier() {
let fs_files = collect_fs_files("./testdata/identifier", true);
assert_eq!(fs_files.len(), 17);
assert_eq!(fs_files.len(), 18);

for path in fs_files {
info!("file -> {:?}", path);
Expand Down
20 changes: 18 additions & 2 deletions src/lexer/states.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use super::Lexer;
use super::LexerError;
use crate::lexer::token::Token;
use crate::lexer::token::TokenKind;
use crate::lexer::token::TokenKind::TokenTick;
use std::fmt::Debug;

pub trait State: Debug {
Expand Down Expand Up @@ -137,7 +138,7 @@ pub struct StateWord;
impl State for StateWord {
fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
match cursor.peek() {
Some(c) if c.is_alphanumeric() || c.eq(&'_') => Ok(Lexer::proceed(
Some(c) if c.is_alphabetic() || c.eq(&'_') => Ok(Lexer::proceed(
Box::new(StateWord),
TransitionKind::AdvanceOffset,
)),
Expand All @@ -162,14 +163,29 @@ pub struct StateSymbol;

impl StateSymbol {
fn is_symbol(c: char) -> bool {
matches!(c, ':' | '=' | '\n')
matches!(c, ':' | '=' | '\n' | '\'')
}
}

impl State for StateSymbol {
fn visit(&self, cursor: &mut Cursor) -> Result<Transition, LexerError> {
match cursor.peek() {
Some('\n') => {
let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string();
let token_kind = TokenKind::from(&lexeme);
// NOTE: if a '\n' is found and it was scanning another "symbol" token, the previous was mangled, and only the '\n' is emitted,
// right now we need to handle only TokenTick since can be at the end of the line, but this can be extended to other symbols
if token_kind == TokenTick {
return Ok(Lexer::proceed(
Box::new(StateStart),
TransitionKind::EmitToken(Token::new(
token_kind,
lexeme,
cursor.location().clone(),
)),
));
}

let transition = Lexer::proceed(
Box::new(StateStart),
TransitionKind::EmitToken(Token::new(
Expand Down
15 changes: 11 additions & 4 deletions src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
const KEYWORD_INT: &str = "int";
const KEYWORD_FLOAT: &str = "float";
const KEYWORD_BOOL: &str = "bool";
const KEYWORD_CHAR: &str = "char";
const KEYWORD_BOOL_TRUE: &str = "true";
const KEYWORD_BOOL_FALSE: &str = "false";
const SEPARATOR_COLON: &str = ":";
Expand All @@ -15,6 +16,7 @@ pub enum Literal {
Int(i64),
Float(f64),
Bool(bool),
Char(char),
}

#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
Expand All @@ -25,6 +27,7 @@ pub enum TokenKind {
TokenNewLine, // \n
TokenColon, // :
TokenAssign, // =
TokenTick, // '
TokenEOF, // End of file
}

Expand All @@ -34,6 +37,7 @@ impl TokenKind {
KEYWORD_INT => Some(TokenKind::TokenKeyword),
KEYWORD_FLOAT => Some(TokenKind::TokenKeyword),
KEYWORD_BOOL => Some(TokenKind::TokenKeyword),
KEYWORD_CHAR => Some(TokenKind::TokenKeyword),
KEYWORD_BOOL_TRUE => Some(TokenKind::TokenLiteral(Literal::Bool(true))),
KEYWORD_BOOL_FALSE => Some(TokenKind::TokenLiteral(Literal::Bool(false))),
_ => None,
Expand All @@ -42,9 +46,10 @@ impl TokenKind {

fn match_number(lexeme: &str) -> Option<TokenKind> {
if lexeme.chars().all(char::is_numeric) {
return Some(TokenKind::TokenLiteral(Literal::Int(
lexeme.parse().unwrap(),
)));
return match lexeme.parse() {
Ok(value) => Some(TokenKind::TokenLiteral(Literal::Int(value))),
Err(_) => None,
};
}

if lexeme.contains('.') {
Expand All @@ -60,6 +65,7 @@ impl TokenKind {
match lexeme {
SEPARATOR_COLON => Some(TokenKind::TokenColon),
SEPARATOR_ASSIGN => Some(TokenKind::TokenAssign),
"'" => Some(TokenKind::TokenTick),
_ => None,
}
}
Expand All @@ -86,7 +92,6 @@ impl From<&String> for TokenKind {
TokenKind::TokenIdentifier
}
}

/// The location of a token in the source code in a human-readable format
#[derive(Debug, Clone, Eq, PartialEq, Deserialize, Serialize)]
pub struct TokenLocation {
Expand Down Expand Up @@ -231,6 +236,7 @@ impl std::fmt::Display for Literal {
Literal::Int(value) => write!(f, "Int({})", value),
Literal::Float(value) => write!(f, "Float({})", value),
Literal::Bool(value) => write!(f, "Bool({})", value),
Literal::Char(value) => write!(f, "Char({})", value),
}
}
}
Expand All @@ -243,6 +249,7 @@ impl std::fmt::Display for TokenKind {
TokenKind::TokenNewLine => write!(f, "TokenNewLine"),
TokenKind::TokenColon => write!(f, "TokenColon"),
TokenKind::TokenAssign => write!(f, "TokenAssign"),
TokenKind::TokenTick => write!(f, "TokenTick"),
TokenKind::TokenEOF => write!(f, "TokenEOF"),
}
}
Expand Down
1 change: 1 addition & 0 deletions testdata/identifier/id_char_assign.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
my_char: char = 'a'
11 changes: 11 additions & 0 deletions testdata/identifier/id_char_assign.tokens
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{"kind": "TokenIdentifier","lexeme": "my_char","location": {"file_path": "","line": 0,"column_start": 0,"column_end": 7}},
{"kind": "TokenColon","lexeme": ":","location": {"file_path": "","line": 0,"column_start": 7,"column_end": 8}},
{"kind": "TokenKeyword","lexeme": "char","location": {"file_path": "","line": 0,"column_start": 9,"column_end": 13}},
{"kind": "TokenAssign","lexeme": "=","location": {"file_path": "","line": 0,"column_start": 14,"column_end": 15}},
{"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 16,"column_end": 17}},
{"kind": "TokenIdentifier","lexeme": "a","location": {"file_path": "","line": 0,"column_start": 17,"column_end": 18}},
{"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 18,"column_end": 19}},
{"kind": "TokenNewLine","lexeme": "\\n","location": {"file_path": "","line": 0,"column_start": 19,"column_end": 19}},
{"kind": "TokenEOF","lexeme": "","location": {"file_path": "","line": 1,"column_start": 0,"column_end": 0}}
]

0 comments on commit 637bd2a

Please sign in to comment.