Merge pull request RustPython#1658 from RustPython/coolreader18/unicode-stuff

coolreader18 · web-flow · commit f93d9763eb7a · 2020-01-05T01:53:30.000-06:00
Update unicodedata with ucd_3_2_0
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
@@ -17,6 +17,6 @@ log="0.4.1"
 regex = "1"
 num-bigint = "0.2"
 num-traits = "0.2"
-unicode-xid = "0.2.0"
-unic-emoji-char = "0.9.0"
+unic-emoji-char = "0.9"
+unic-ucd-ident = "0.9"
 wtf8 = "0.0.3"
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
@@ -2,9 +2,6 @@
 //!
 //! This means source code is translated into separate tokens.
 
-extern crate unic_emoji_char;
-extern crate unicode_xid;
-
 pub use super::token::Tok;
 use crate::error::{LexicalError, LexicalErrorType};
 use crate::location::Location;
@@ -15,8 +12,7 @@ use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::str::FromStr;
 use unic_emoji_char::is_emoji_presentation;
-use unicode_xid::UnicodeXID;
-use wtf8;
+use unic_ucd_ident::{is_xid_continue, is_xid_start};
 
 #[derive(Clone, Copy, PartialEq, Debug, Default)]
 struct IndentationLevel {
@@ -658,17 +654,14 @@ where
     }
 
     fn is_identifier_start(&self, c: char) -> bool {
-        match c {
-            '_' => true,
-            c => UnicodeXID::is_xid_start(c),
-        }
+        c == '_' || is_xid_start(c)
     }
 
     fn is_identifier_continuation(&self) -> bool {
         if let Some(c) = self.chr0 {
             match c {
                 '_' | '0'..='9' => true,
-                c => UnicodeXID::is_xid_continue(c),
+                c => is_xid_continue(c),
             }
         } else {
             false
diff --git a/tests/snippets/unicode_fu.py b/tests/snippets/unicode_fu.py
@@ -19,3 +19,21 @@
 assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
 assert unicodedata.bidirectional('a') == 'L'
 assert unicodedata.normalize('NFC', 'bla') == 'bla'
+
+# testing unicodedata.ucd_3_2_0 for idna
+assert "abcСĤ".encode("idna") == b'xn--abc-7sa390b'
+# TODO: fix: assert "abc䄣Ĳ".encode("idna") == b'xn--abcij-zb5f'
+
+# from CPython tests
+assert "python.org".encode("idna") == b"python.org"
+assert "python.org.".encode("idna") == b"python.org."
+assert "pyth\xf6n.org".encode("idna") == b"xn--pythn-mua.org"
+assert "pyth\xf6n.org.".encode("idna") == b"xn--pythn-mua.org."
+assert b"python.org".decode("idna") == "python.org"
+assert b"python.org.".decode("idna") == "python.org."
+assert b"xn--pythn-mua.org".decode("idna") == "pyth\xf6n.org"
+assert b"xn--pythn-mua.org.".decode("idna") == "pyth\xf6n.org."
+
+# TODO: add east_asian_width and mirrored to unicodedata.ucd_3_2_0
+# assert unicodedata.ucd_3_2_0.east_asian_width('\u231a') == 'N'
+# assert not unicodedata.ucd_3_2_0.mirrored("\u0f3a")
diff --git a/vm/Cargo.toml b/vm/Cargo.toml
@@ -46,18 +46,19 @@ rustc_version_runtime = "0.1.*"
 statrs = "0.12.0"
 caseless = "0.2.1"
 chrono = { version = "=0.4.9", features = ["wasmbind"] }
-unicode-xid = "0.2.0"
 lazy_static = "^1.0.1"
 lexical = "4"
 itertools = "0.8"
 hex = "0.4.0"
 hexf-parse = "0.1.0"
 indexmap = "1.0.2"
 crc = "^1.0.0"
-unicode_categories = "0.1.1"
-unicode_names2 = "0.3.0"
-unicode-casing = "0.1.0"
-unic = "0.9.0"
+unicode_names2 = "0.3"
+# TODO: use unic for this; needed for title case:
+# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
+unicode-casing = "0.1"
+unic = "0.9"
+unic-common = "0.9"
 maplit = "1.0"
 proc-macro-hack = { version = "0.5", optional = true }
 bitflags = "1.1"
diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs
@@ -1,6 +1,3 @@
-extern crate unicode_categories;
-extern crate unicode_xid;
-
 use std::cell::Cell;
 use std::char;
 use std::fmt;
@@ -10,10 +7,10 @@ use std::str::FromStr;
 use std::string::ToString;
 
 use num_traits::ToPrimitive;
+use unic::ucd::category::GeneralCategory;
+use unic::ucd::ident::{is_xid_continue, is_xid_start};
 use unic::ucd::is_cased;
 use unicode_casing::CharExt;
-use unicode_categories::UnicodeCategories;
-use unicode_xid::UnicodeXID;
 
 use super::objbytes::{PyBytes, PyBytesRef};
 use super::objdict::PyDict;
@@ -366,16 +363,7 @@ impl PyString {
                 formatted.push_str(&format!("\\x{:02x}", c as u32));
             } else if c.is_ascii() {
                 formatted.push(c);
-            } else if c.is_other() || c.is_separator() {
-                // According to python following categories aren't printable:
-                // * Cc (Other, Control)
-                // * Cf (Other, Format)
-                // * Cs (Other, Surrogate)
-                // * Co (Other, Private Use)
-                // * Cn (Other, Not Assigned)
-                // * Zl Separator, Line ('\u2028', LINE SEPARATOR)
-                // * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
-                // * Zs (Separator, Space) other than ASCII space('\x20').
+            } else if !char_is_printable(c) {
                 let code = c as u32;
                 let escaped = if code < 0xff {
                     format!("\\U{:02x}", code)
@@ -742,10 +730,9 @@ impl PyString {
     ///   * Zs (Separator, Space) other than ASCII space('\x20').
     #[pymethod]
     fn isprintable(&self, _vm: &VirtualMachine) -> bool {
-        self.value.chars().all(|c| match c {
-            '\u{0020}' => true,
-            _ => !(c.is_other_control() | c.is_separator()),
-        })
+        self.value
+            .chars()
+            .all(|c| c == '\u{0020}' || char_is_printable(c))
     }
 
     // cpython's isspace ignores whitespace, including \t and \n, etc, unless the whole string is empty
@@ -1094,13 +1081,9 @@ impl PyString {
     #[pymethod]
     fn isidentifier(&self, _vm: &VirtualMachine) -> bool {
         let mut chars = self.value.chars();
-        let is_identifier_start = match chars.next() {
-            Some('_') => true,
-            Some(c) => UnicodeXID::is_xid_start(c),
-            None => false,
-        };
+        let is_identifier_start = chars.next().map_or(false, |c| c == '_' || is_xid_start(c));
         // a string is not an identifier if it has whitespace or starts with a number
-        is_identifier_start && chars.all(UnicodeXID::is_xid_continue)
+        is_identifier_start && chars.all(is_xid_continue)
     }
 
     // https://docs.python.org/3/library/stdtypes.html#str.translate
@@ -1706,6 +1689,20 @@ fn adjust_indices(
     }
 }
 
+// According to Python, characters in the following categories aren't printable:
+// * Cc (Other, Control)
+// * Cf (Other, Format)
+// * Cs (Other, Surrogate)
+// * Co (Other, Private Use)
+// * Cn (Other, Not Assigned)
+// * Zl (Separator, Line) ('\u2028', LINE SEPARATOR)
+// * Zp (Separator, Paragraph) ('\u2029', PARAGRAPH SEPARATOR)
+// * Zs (Separator, Space) other than ASCII space ('\x20'); callers special-case the space.
+fn char_is_printable(c: char) -> bool {
+    let cat = GeneralCategory::of(c);
+    !(cat.is_other() || cat.is_separator())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/vm/src/pyobject.rs b/vm/src/pyobject.rs
@@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static {
             };
             PyRef::new_ref(PyObject::new(self, cls, dict), vm)
         } else {
-            let subtype = vm.to_pystr(&cls.obj)?;
-            let basetype = vm.to_pystr(&class.obj)?;
+            let subtype = vm.to_str(&cls.obj)?;
+            let basetype = vm.to_str(&class.obj)?;
             Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype)))
         }
     }
diff --git a/vm/src/stdlib/unicodedata.rs b/vm/src/stdlib/unicodedata.rs

Original file line number	Diff line number	Diff line change
`@@ -1098,8 +1098,8 @@ pub trait PyValue: fmt::Debug + Sized + 'static {`
`1098`	`1098`	`};`
`1099`	`1099`	`PyRef::new_ref(PyObject::new(self, cls, dict), vm)`
`1100`	`1100`	`} else {`
`1101`		`- let subtype = vm.to_pystr(&cls.obj)?;`
`1102`		`- let basetype = vm.to_pystr(&class.obj)?;`
	`1101`	`+ let subtype = vm.to_str(&cls.obj)?;`
	`1102`	`+ let basetype = vm.to_str(&class.obj)?;`
`1103`	`1103`	`Err(vm.new_type_error(format!("{} is not a subtype of {}", subtype, basetype)))`
`1104`	`1104`	`}`
`1105`	`1105`	`}`