Skip to content

Commit f93d976

Browse files
authored
Merge pull request RustPython#1658 from RustPython/coolreader18/unicode-stuff
Update unicodedata with ucd_3_2_0
2 parents cdba57d + 7d3f341 commit f93d976

File tree

8 files changed

+192
-111
lines changed

8 files changed

+192
-111
lines changed

Cargo.lock

Lines changed: 2 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

parser/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,6 @@ log="0.4.1"
1717
regex = "1"
1818
num-bigint = "0.2"
1919
num-traits = "0.2"
20-
unicode-xid = "0.2.0"
21-
unic-emoji-char = "0.9.0"
20+
unic-emoji-char = "0.9"
21+
unic-ucd-ident = "0.9"
2222
wtf8 = "0.0.3"

parser/src/lexer.rs

Lines changed: 3 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,6 @@
22
//!
33
//! This means source code is translated into separate tokens.
44
5-
extern crate unic_emoji_char;
6-
extern crate unicode_xid;
7-
85
pub use super::token::Tok;
96
use crate::error::{LexicalError, LexicalErrorType};
107
use crate::location::Location;
@@ -15,8 +12,7 @@ use std::cmp::Ordering;
1512
use std::collections::HashMap;
1613
use std::str::FromStr;
1714
use unic_emoji_char::is_emoji_presentation;
18-
use unicode_xid::UnicodeXID;
19-
use wtf8;
15+
use unic_ucd_ident::{is_xid_continue, is_xid_start};
2016

2117
#[derive(Clone, Copy, PartialEq, Debug, Default)]
2218
struct IndentationLevel {
@@ -658,17 +654,14 @@ where
658654
}
659655

660656
fn is_identifier_start(&self, c: char) -> bool {
661-
match c {
662-
'_' => true,
663-
c => UnicodeXID::is_xid_start(c),
664-
}
657+
c == '_' || is_xid_start(c)
665658
}
666659

667660
fn is_identifier_continuation(&self) -> bool {
668661
if let Some(c) = self.chr0 {
669662
match c {
670663
'_' | '0'..='9' => true,
671-
c => UnicodeXID::is_xid_continue(c),
664+
c => is_xid_continue(c),
672665
}
673666
} else {
674667
false

tests/snippets/unicode_fu.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,3 +19,21 @@
1919
assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
2020
assert unicodedata.bidirectional('a') == 'L'
2121
assert unicodedata.normalize('NFC', 'bla') == 'bla'
22+
23+
# testing unicodedata.ucd_3_2_0 for idna
24+
assert "abcСĤ".encode("idna") == b'xn--abc-7sa390b'
25+
# TODO: fix: assert "abc䄣IJ".encode("idna") == b'xn--abcij-zb5f'
26+
27+
# from CPython tests
28+
assert "python.org".encode("idna") == b"python.org"
29+
assert "python.org.".encode("idna") == b"python.org."
30+
assert "pyth\xf6n.org".encode("idna") == b"xn--pythn-mua.org"
31+
assert "pyth\xf6n.org.".encode("idna") == b"xn--pythn-mua.org."
32+
assert b"python.org".decode("idna") == "python.org"
33+
assert b"python.org.".decode("idna") == "python.org."
34+
assert b"xn--pythn-mua.org".decode("idna") == "pyth\xf6n.org"
35+
assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
36+
37+
# TODO: add east_asian_width and mirrored
38+
# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
39+
# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")

vm/Cargo.toml

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -46,18 +46,19 @@ rustc_version_runtime = "0.1.*"
4646
statrs = "0.12.0"
4747
caseless = "0.2.1"
4848
chrono = { version = "=0.4.9", features = ["wasmbind"] }
49-
unicode-xid = "0.2.0"
5049
lazy_static = "^1.0.1"
5150
lexical = "4"
5251
itertools = "0.8"
5352
hex = "0.4.0"
5453
hexf-parse = "0.1.0"
5554
indexmap = "1.0.2"
5655
crc = "^1.0.0"
57-
unicode_categories = "0.1.1"
58-
unicode_names2 = "0.3.0"
59-
unicode-casing = "0.1.0"
60-
unic = "0.9.0"
56+
unicode_names2 = "0.3"
57+
# TODO: use unic for this; needed for title case:
58+
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
59+
unicode-casing = "0.1"
60+
unic = "0.9"
61+
unic-common = "0.9"
6162
maplit = "1.0"
6263
proc-macro-hack = { version = "0.5", optional = true }
6364
bitflags = "1.1"

vm/src/obj/objstr.rs

Lines changed: 22 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,3 @@
1-
extern crate unicode_categories;
2-
extern crate unicode_xid;
3-
41
use std::cell::Cell;
52
use std::char;
63
use std::fmt;
@@ -10,10 +7,10 @@ use std::str::FromStr;
107
use std::string::ToString;
118

129
use num_traits::ToPrimitive;
10+
use unic::ucd::category::GeneralCategory;
11+
use unic::ucd::ident::{is_xid_continue, is_xid_start};
1312
use unic::ucd::is_cased;
1413
use unicode_casing::CharExt;
15-
use unicode_categories::UnicodeCategories;
16-
use unicode_xid::UnicodeXID;
1714

1815
use super::objbytes::{PyBytes, PyBytesRef};
1916
use super::objdict::PyDict;
@@ -366,16 +363,7 @@ impl PyString {
366363
formatted.push_str(&format!("\\x{:02x}", c as u32));
367364
} else if c.is_ascii() {
368365
formatted.push(c);
369-
} else if c.is_other() || c.is_separator() {
370-
// According to python following categories aren't printable:
371-
// * Cc (Other, Control)
372-
// * Cf (Other, Format)
373-
// * Cs (Other, Surrogate)
374-
// * Co (Other, Private Use)
375-
// * Cn (Other, Not Assigned)
376-
// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
377-
// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
378-
// * Zs (Separator, Space) other than ASCII space('\x20').
366+
} else if !char_is_printable(c) {
379367
let code = c as u32;
380368
let escaped = if code < 0xff {
381369
format!("\\U{:02x}", code)
@@ -742,10 +730,9 @@ impl PyString {
742730
/// * Zs (Separator, Space) other than ASCII space('\x20').
743731
#[pymethod]
744732
fn isprintable(&self, _vm: &VirtualMachine) -> bool {
745-
self.value.chars().all(|c| match c {
746-
'\u{0020}' => true,
747-
_ => !(c.is_other_control() | c.is_separator()),
748-
})
733+
self.value
734+
.chars()
735+
.all(|c| c == '\u{0020}' || char_is_printable(c))
749736
}
750737

751738
// cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty
@@ -1094,13 +1081,9 @@ impl PyString {
10941081
#[pymethod]
10951082
fn isidentifier(&self, _vm: &VirtualMachine) -> bool {
10961083
let mut chars = self.value.chars();
1097-
let is_identifier_start = match chars.next() {
1098-
Some('_') => true,
1099-
Some(c) => UnicodeXID::is_xid_start(c),
1100-
None => false,
1101-
};
1084+
let is_identifier_start = chars.next().map_or(false, |c| c == '_' || is_xid_start(c));
11021085
// a string is not an identifier if it has whitespace or starts with a number
1103-
is_identifier_start && chars.all(UnicodeXID::is_xid_continue)
1086+
is_identifier_start && chars.all(is_xid_continue)
11041087
}
11051088

11061089
// https://docs.python.org/3/library/stdtypes.html#str.translate
@@ -1706,6 +1689,20 @@ fn adjust_indices(
17061689
}
17071690
}
17081691

1692+
// According to python following categories aren't printable:
1693+
// * Cc (Other, Control)
1694+
// * Cf (Other, Format)
1695+
// * Cs (Other, Surrogate)
1696+
// * Co (Other, Private Use)
1697+
// * Cn (Other, Not Assigned)
1698+
// * Zl Separator, Line ('\u2028', LINE SEPARATOR)
1699+
// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
1700+
// * Zs (Separator, Space) other than ASCII space('\x20').
1701+
fn char_is_printable(c: char) -> bool {
1702+
let cat = GeneralCategory::of(c);
1703+
!(cat.is_other() || cat.is_separator())
1704+
}
1705+
17091706
#[cfg(test)]
17101707
mod tests {
17111708
use super::*;

vm/src/pyobject.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static {
10981098
};
10991099
PyRef::new_ref(PyObject::new(self, cls, dict), vm)
11001100
} else {
1101-
let subtype = vm.to_pystr(&cls.obj)?;
1102-
let basetype = vm.to_pystr(&class.obj)?;
1101+
let subtype = vm.to_str(&cls.obj)?;
1102+
let basetype = vm.to_str(&class.obj)?;
11031103
Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype)))
11041104
}
11051105
}

0 commit comments

Comments
 (0)