support unicode literal

Antonio Yang · Antonio Yang · commit 974dc683e4f6 · 2019-06-20T00:30:38.000+08:00
- support unicode literal \x with 2 digits
- support unicode literal \u with 4 digits
- support unicode literal \U with 8 digits
- avoid parsing \x as a unicode literal in bytes
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
@@ -17,3 +17,4 @@ num-traits = "0.2"
 unicode-xid = "0.1.0"
 unic-emoji-char = "0.9.0"
 serde = { version = "1.0.66", features = ["derive"] }
+wtf8 = "0.0.3"
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
@@ -13,6 +13,7 @@ use std::collections::HashMap;
 use std::str::FromStr;
 use unic_emoji_char::is_emoji_presentation;
 use unicode_xid::UnicodeXID;
+use wtf8;
 
 #[derive(Clone, Copy, PartialEq, Debug)]
 struct IndentationLevel {
@@ -67,6 +68,7 @@ pub struct LexicalError {
 #[derive(Debug)]
 pub enum LexicalErrorType {
     StringError,
+    UnicodeError,
     NestingError,
     UnrecognizedToken { tok: char },
     OtherError(String),
@@ -456,6 +458,27 @@ where
         }
     }
 
+    fn unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
+        let mut p: u32 = 0u32;
+        let unicode_error = Err(LexicalError {
+            error: LexicalErrorType::UnicodeError,
+            location: self.get_pos(),
+        });
+        for i in 1..=literal_number {
+            match self.next_char() {
+                Some(c) => match c.to_digit(16) {
+                    Some(d) => p += d << (literal_number - i) * 4,
+                    None => return unicode_error,
+                },
+                None => return unicode_error,
+            }
+        }
+        match wtf8::CodePoint::from_u32(p) {
+            Some(cp) => return Ok(cp.to_char_lossy()),
+            None => return unicode_error,
+        }
+    }
+
     fn lex_string(
         &mut self,
         is_bytes: bool,
@@ -513,6 +536,9 @@ where
                             Some('t') => {
                                 string_content.push('\t');
                             }
+                            Some('u') => string_content.push(self.unicode_literal(4)?),
+                            Some('U') => string_content.push(self.unicode_literal(8)?),
+                            Some('x') if !is_bytes => string_content.push(self.unicode_literal(2)?),
                             Some('v') => string_content.push('\x0b'),
                             Some(c) => {
                                 string_content.push('\\');
diff --git a/tests/snippets/strings.py b/tests/snippets/strings.py
@@ -240,3 +240,12 @@ def try_mutate_str():
 assert "abcdefg".isprintable()
 assert not "abcdefg\n".isprintable()
 assert "ʹ".isprintable()
+
+# test unicode literals
+assert "\xac" == "¬"
+assert "\u0037" == "7"
+assert "\u0040" == "@"
+assert "\u0041" == "A"
+assert "\u00BE" == "¾"
+assert "\u9487" == "钇"
+assert "\U0001F609" == "😉"