From 97f861a782744f27373d24aad0ed72697ab54d5d Mon Sep 17 00:00:00 2001 From: Federico Guerinoni Date: Tue, 13 Aug 2024 12:16:13 +0200 Subject: [PATCH] lex: Handle the char variable initialization There was a bug also when a symbol is at the end of the line, now it is handled correctly for the tick symbol. Signed-off-by: Federico Guerinoni --- src/lexer/mod.rs | 8 ++++---- src/lexer/states.rs | 20 ++++++++++++++++++-- src/lexer/token.rs | 15 +++++++++++---- testdata/identifier/id_char_assign.fs | 1 + testdata/identifier/id_char_assign.tokens | 11 +++++++++++ 5 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 testdata/identifier/id_char_assign.fs create mode 100644 testdata/identifier/id_char_assign.tokens diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 68672d3..271631c 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -54,13 +54,13 @@ impl Iterator for Lexer { Ok(transition) => transition, Err(err) => { self.errors.push(err.clone()); - match err { + return match err { LexerError::UnexpectedToken(token) => { error!("Unexpected token: {}", token); // TODO: return a transition to continue lexing (for error recovery) - return None; + None } - } + }; } }; let (state, transition_kind) = transition.into_parts(); @@ -133,7 +133,7 @@ mod tests { #[test] fn identifier() { let fs_files = collect_fs_files("./testdata/identifier", true); - assert_eq!(fs_files.len(), 17); + assert_eq!(fs_files.len(), 18); for path in fs_files { info!("file -> {:?}", path); diff --git a/src/lexer/states.rs b/src/lexer/states.rs index 921f6ff..6a918b5 100644 --- a/src/lexer/states.rs +++ b/src/lexer/states.rs @@ -3,6 +3,7 @@ use super::Lexer; use super::LexerError; use crate::lexer::token::Token; use crate::lexer::token::TokenKind; +use crate::lexer::token::TokenKind::TokenTick; use std::fmt::Debug; pub trait State: Debug { @@ -137,7 +138,7 @@ pub struct StateWord; impl State for StateWord { fn visit(&self, cursor: &mut Cursor) -> Result { match cursor.peek() { - 
Some(c) if c.is_alphanumeric() || c.eq(&'_') => Ok(Lexer::proceed( + Some(c) if c.is_alphabetic() || c.eq(&'_') => Ok(Lexer::proceed( Box::new(StateWord), TransitionKind::AdvanceOffset, )), @@ -162,7 +163,7 @@ pub struct StateSymbol; impl StateSymbol { fn is_symbol(c: char) -> bool { - matches!(c, ':' | '=' | '\n') + matches!(c, ':' | '=' | '\n' | '\'') } } @@ -170,6 +171,21 @@ impl State for StateSymbol { fn visit(&self, cursor: &mut Cursor) -> Result { match cursor.peek() { Some('\n') => { + let lexeme = cursor.source().content()[cursor.index()..cursor.offset()].to_string(); + let token_kind = TokenKind::from(&lexeme); + // NOTE: if a '\n' is found and it was scanning another "symbol" token, the previous was mangled, and only the '\n' is emitted, + // right now we need to handle only TokenTick since can be at the end of the line, but this can be extended to other symbols + if token_kind == TokenTick { + return Ok(Lexer::proceed( + Box::new(StateStart), + TransitionKind::EmitToken(Token::new( + token_kind, + lexeme, + cursor.location().clone(), + )), + )); + } + let transition = Lexer::proceed( Box::new(StateStart), TransitionKind::EmitToken(Token::new( diff --git a/src/lexer/token.rs b/src/lexer/token.rs index c0a9064..bfcfcd5 100644 --- a/src/lexer/token.rs +++ b/src/lexer/token.rs @@ -5,6 +5,7 @@ use std::path::{Path, PathBuf}; const KEYWORD_INT: &str = "int"; const KEYWORD_FLOAT: &str = "float"; const KEYWORD_BOOL: &str = "bool"; +const KEYWORD_CHAR: &str = "char"; const KEYWORD_BOOL_TRUE: &str = "true"; const KEYWORD_BOOL_FALSE: &str = "false"; const SEPARATOR_COLON: &str = ":"; @@ -15,6 +16,7 @@ pub enum Literal { Int(i64), Float(f64), Bool(bool), + Char(char), } #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] @@ -25,6 +27,7 @@ pub enum TokenKind { TokenNewLine, // \n TokenColon, // : TokenAssign, // = + TokenTick, // ' TokenEOF, // End of file } @@ -34,6 +37,7 @@ impl TokenKind { KEYWORD_INT => Some(TokenKind::TokenKeyword), KEYWORD_FLOAT => 
Some(TokenKind::TokenKeyword), KEYWORD_BOOL => Some(TokenKind::TokenKeyword), + KEYWORD_CHAR => Some(TokenKind::TokenKeyword), KEYWORD_BOOL_TRUE => Some(TokenKind::TokenLiteral(Literal::Bool(true))), KEYWORD_BOOL_FALSE => Some(TokenKind::TokenLiteral(Literal::Bool(false))), _ => None, @@ -42,9 +46,10 @@ impl TokenKind { fn match_number(lexeme: &str) -> Option { if lexeme.chars().all(char::is_numeric) { - return Some(TokenKind::TokenLiteral(Literal::Int( - lexeme.parse().unwrap(), - ))); + return match lexeme.parse() { + Ok(value) => Some(TokenKind::TokenLiteral(Literal::Int(value))), + Err(_) => None, + }; } if lexeme.contains('.') { @@ -60,6 +65,7 @@ impl TokenKind { match lexeme { SEPARATOR_COLON => Some(TokenKind::TokenColon), SEPARATOR_ASSIGN => Some(TokenKind::TokenAssign), + "'" => Some(TokenKind::TokenTick), _ => None, } } @@ -86,7 +92,6 @@ impl From<&String> for TokenKind { TokenKind::TokenIdentifier } } - /// The location of a token in the source code in a uman-readable format #[derive(Debug, Clone, Eq, PartialEq, Deserialize, Serialize)] pub struct TokenLocation { @@ -231,6 +236,7 @@ impl std::fmt::Display for Literal { Literal::Int(value) => write!(f, "Int({})", value), Literal::Float(value) => write!(f, "Float({})", value), Literal::Bool(value) => write!(f, "Bool({})", value), + Literal::Char(value) => write!(f, "Char({})", value), } } } @@ -243,6 +249,7 @@ impl std::fmt::Display for TokenKind { TokenKind::TokenNewLine => write!(f, "TokenNewLine"), TokenKind::TokenColon => write!(f, "TokenColon"), TokenKind::TokenAssign => write!(f, "TokenAssign"), + TokenKind::TokenTick => write!(f, "TokenTick"), TokenKind::TokenEOF => write!(f, "TokenEOF"), } } diff --git a/testdata/identifier/id_char_assign.fs b/testdata/identifier/id_char_assign.fs new file mode 100644 index 0000000..2a1a202 --- /dev/null +++ b/testdata/identifier/id_char_assign.fs @@ -0,0 +1 @@ +my_char: char = 'a' diff --git a/testdata/identifier/id_char_assign.tokens 
b/testdata/identifier/id_char_assign.tokens new file mode 100644 index 0000000..4cfc31d --- /dev/null +++ b/testdata/identifier/id_char_assign.tokens @@ -0,0 +1,11 @@ +[ + {"kind": "TokenIdentifier","lexeme": "my_char","location": {"file_path": "","line": 0,"column_start": 0,"column_end": 7}}, + {"kind": "TokenColon","lexeme": ":","location": {"file_path": "","line": 0,"column_start": 7,"column_end": 8}}, + {"kind": "TokenKeyword","lexeme": "char","location": {"file_path": "","line": 0,"column_start": 9,"column_end": 13}}, + {"kind": "TokenAssign","lexeme": "=","location": {"file_path": "","line": 0,"column_start": 14,"column_end": 15}}, + {"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 16,"column_end": 17}}, + {"kind": "TokenIdentifier","lexeme": "a","location": {"file_path": "","line": 0,"column_start": 17,"column_end": 18}}, + {"kind": "TokenTick","lexeme": "'","location": {"file_path": "","line": 0,"column_start": 18,"column_end": 19}}, + {"kind": "TokenNewLine","lexeme": "\\n","location": {"file_path": "","line": 0,"column_start": 19,"column_end": 19}}, + {"kind": "TokenEOF","lexeme": "","location": {"file_path": "","line": 1,"column_start": 0,"column_end": 0}} +]