Skip to content

Commit 974dc68

Browse files
author
Antonio Yang
committed
support unicode literal
- support unicode literal \x with 2 digits - support unicode literal \u with 4 digits - support unicode literal \U with 8 digits - avoid parsing \x as a unicode literal in bytes
1 parent 50662c4 commit 974dc68

File tree

3 files changed

+36
-0
lines changed

3 files changed

+36
-0
lines changed

parser/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ num-traits = "0.2"
1717
unicode-xid = "0.1.0"
1818
unic-emoji-char = "0.9.0"
1919
serde = { version = "1.0.66", features = ["derive"] }
20+
wtf8 = "0.0.3"

parser/src/lexer.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use std::collections::HashMap;
1313
use std::str::FromStr;
1414
use unic_emoji_char::is_emoji_presentation;
1515
use unicode_xid::UnicodeXID;
16+
use wtf8;
1617

1718
#[derive(Clone, Copy, PartialEq, Debug)]
1819
struct IndentationLevel {
@@ -67,6 +68,7 @@ pub struct LexicalError {
6768
#[derive(Debug)]
6869
pub enum LexicalErrorType {
6970
StringError,
71+
UnicodeError,
7072
NestingError,
7173
UnrecognizedToken { tok: char },
7274
OtherError(String),
@@ -456,6 +458,27 @@ where
456458
}
457459
}
458460

461+
fn unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
462+
let mut p: u32 = 0u32;
463+
let unicode_error = Err(LexicalError {
464+
error: LexicalErrorType::UnicodeError,
465+
location: self.get_pos(),
466+
});
467+
for i in 1..=literal_number {
468+
match self.next_char() {
469+
Some(c) => match c.to_digit(16) {
470+
Some(d) => p += d << (literal_number - i) * 4,
471+
None => return unicode_error,
472+
},
473+
None => return unicode_error,
474+
}
475+
}
476+
match wtf8::CodePoint::from_u32(p) {
477+
Some(cp) => return Ok(cp.to_char_lossy()),
478+
None => return unicode_error,
479+
}
480+
}
481+
459482
fn lex_string(
460483
&mut self,
461484
is_bytes: bool,
@@ -513,6 +536,9 @@ where
513536
Some('t') => {
514537
string_content.push('\t');
515538
}
539+
Some('u') => string_content.push(self.unicode_literal(4)?),
540+
Some('U') => string_content.push(self.unicode_literal(8)?),
541+
Some('x') if !is_bytes => string_content.push(self.unicode_literal(2)?),
516542
Some('v') => string_content.push('\x0b'),
517543
Some(c) => {
518544
string_content.push('\\');

tests/snippets/strings.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,3 +240,12 @@ def try_mutate_str():
240240
assert "abcdefg".isprintable()
241241
assert not "abcdefg\n".isprintable()
242242
assert "ʹ".isprintable()
243+
244+
# test unicode literals
245+
assert "\xac" == "¬"
246+
assert "\u0037" == "7"
247+
assert "\u0040" == "@"
248+
assert "\u0041" == "A"
249+
assert "\u00BE" == "¾"
250+
assert "\u9487" == "钇"
251+
assert "\U0001F609" == "😉"

0 commit comments

Comments
 (0)