Skip to content

Commit 7ac90f5

Browse files
authored
Merge pull request RustPython#5587 from coolreader18/wtf8
Allow surrogates in str
2 parents e3a1031 + f3b8d55 commit 7ac90f5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+3880
-1034
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/string_tests.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,8 +1066,6 @@ def test_hash(self):
10661066
hash(b)
10671067
self.assertEqual(hash(a), hash(b))
10681068

1069-
# TODO: RUSTPYTHON
1070-
@unittest.expectedFailure
10711069
def test_capitalize_nonascii(self):
10721070
# check that titlecased chars are lowered correctly
10731071
# \u1ffc is the titlecased char

Lib/test/test_cmd_line_script.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ def test_pep_409_verbiage(self):
574574
self.assertTrue(text[1].startswith(' File '))
575575
self.assertTrue(text[3].startswith('NameError'))
576576

577+
@unittest.expectedFailureIf(sys.platform == "linux", "TODO: RUSTPYTHON")
577578
def test_non_ascii(self):
578579
# Mac OS X denies the creation of a file with an invalid UTF-8 name.
579580
# Windows allows creating a name with an arbitrary bytes name, but

Lib/test/test_codecs.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,8 +1698,6 @@ def test_decode_invalid(self):
16981698

16991699

17001700
class NameprepTest(unittest.TestCase):
1701-
# TODO: RUSTPYTHON
1702-
@unittest.expectedFailure
17031701
def test_nameprep(self):
17041702
from encodings.idna import nameprep
17051703
for pos, (orig, prepped) in enumerate(nameprep_tests):

Lib/test/test_difflib.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,8 +373,6 @@ def test_byte_content(self):
373373
check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
374374
check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))
375375

376-
# TODO: RUSTPYTHON
377-
@unittest.expectedFailure
378376
def test_byte_filenames(self):
379377
# somebody renamed a file from ISO-8859-2 to UTF-8
380378
fna = b'\xb3odz.txt' # "łodz.txt"

Lib/test/test_import/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,8 @@ def exec_module(*args):
13051305
else:
13061306
importlib.SourceLoader.exec_module = old_exec_module
13071307

1308+
# TODO: RUSTPYTHON
1309+
@unittest.expectedFailure
13081310
@unittest.skipUnless(TESTFN_UNENCODABLE, 'need TESTFN_UNENCODABLE')
13091311
def test_unencodable_filename(self):
13101312
# Issue #11619: The Python parser and the import machinery must not

Lib/test/test_json/test_scanstring.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,4 @@ def test_overflow(self):
143143

144144

145145
class TestPyScanstring(TestScanstring, PyTest): pass
146-
# TODO: RUSTPYTHON
147-
class TestPyScanstring(TestScanstring, PyTest):
148-
# TODO: RUSTPYTHON
149-
@unittest.expectedFailure
150-
def test_bad_escapes(self):
151-
super().test_bad_escapes()
152146
class TestCScanstring(TestScanstring, CTest): pass

Lib/test/test_ntpath.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,12 +1032,6 @@ class NtCommonTest(test_genericpath.CommonTest, unittest.TestCase):
10321032
pathmodule = ntpath
10331033
attributes = ['relpath']
10341034

1035-
# TODO: RUSTPYTHON
1036-
if sys.platform == "linux":
1037-
@unittest.expectedFailure
1038-
def test_nonascii_abspath(self):
1039-
super().test_nonascii_abspath()
1040-
10411035
# TODO: RUSTPYTHON
10421036
if sys.platform == "win32":
10431037
# TODO: RUSTPYTHON, ValueError: illegal environment variable name

Lib/test/test_re.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -854,8 +854,6 @@ def test_string_boundaries(self):
854854
# Can match around the whitespace.
855855
self.assertEqual(len(re.findall(r"\B", " ")), 2)
856856

857-
# TODO: RUSTPYTHON
858-
@unittest.expectedFailure
859857
def test_bigcharset(self):
860858
self.assertEqual(re.match("([\u2222\u2223])",
861859
"\u2222").group(1), "\u2222")
@@ -2233,6 +2231,7 @@ def test_bug_40736(self):
22332231
with self.assertRaisesRegex(TypeError, "got 'type'"):
22342232
re.search("x*", type)
22352233

2234+
@unittest.skip("TODO: RUSTPYTHON: flaky, improve perf")
22362235
@requires_resource('cpu')
22372236
def test_search_anchor_at_beginning(self):
22382237
s = 'x'*10**7

Lib/test/test_smtplib.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,8 +1459,6 @@ def test_send_unicode_with_SMTPUTF8_via_low_level_API(self):
14591459
self.assertIn('SMTPUTF8', self.serv.last_mail_options)
14601460
self.assertEqual(self.serv.last_rcpt_options, [])
14611461

1462-
# TODO: RUSTPYTHON
1463-
@unittest.expectedFailure
14641462
def test_send_message_uses_smtputf8_if_addrs_non_ascii(self):
14651463
msg = EmailMessage()
14661464
msg['From'] = "Páolo <főo@bar.com>"

Lib/test/test_socket.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,7 +1578,7 @@ def test_getnameinfo(self):
15781578
# only IP addresses are allowed
15791579
self.assertRaises(OSError, socket.getnameinfo, ('mail.python.org',0), 0)
15801580

1581-
@unittest.expectedFailureIf(sys.platform != "darwin", "TODO: RUSTPYTHON; socket.gethostbyname_ex")
1581+
@unittest.skip("TODO: RUSTPYTHON: flaky on CI?")
15821582
@unittest.skipUnless(support.is_resource_enabled('network'),
15831583
'network is not enabled')
15841584
def test_idna(self):
@@ -5519,8 +5519,6 @@ def testBytesAddr(self):
55195519
self.addCleanup(os_helper.unlink, path)
55205520
self.assertEqual(self.sock.getsockname(), path)
55215521

5522-
# TODO: RUSTPYTHON, surrogateescape
5523-
@unittest.expectedFailure
55245522
def testSurrogateescapeBind(self):
55255523
# Test binding to a valid non-ASCII pathname, with the
55265524
# non-ASCII bytes supplied using surrogateescape encoding.

Lib/test/test_sqlite3/test_types.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,6 @@ def test_too_large_int(self):
9595
row = self.cur.fetchone()
9696
self.assertIsNone(row)
9797

98-
# TODO: RUSTPYTHON
99-
@unittest.expectedFailure
10098
def test_string_with_surrogates(self):
10199
for value in 0xd8ff, 0xdcff:
102100
with self.assertRaises(UnicodeEncodeError):

Lib/test/test_ucn.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,6 @@ def test_cjk_unified_ideographs(self):
102102
self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
103103
self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")
104104

105-
# TODO: RUSTPYTHON
106-
@unittest.expectedFailure
107105
def test_bmp_characters(self):
108106
for code in range(0x10000):
109107
char = chr(code)

Lib/test/test_unicode.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -721,8 +721,6 @@ def test_isspace(self):
721721
'\U0001F40D', '\U0001F46F']:
722722
self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
723723

724-
# TODO: RUSTPYTHON
725-
@unittest.expectedFailure
726724
@support.requires_resource('cpu')
727725
def test_isspace_invariant(self):
728726
for codepoint in range(sys.maxunicode + 1):

Lib/test/test_unicodedata.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,6 @@ def test_function_checksum(self):
9999
result = h.hexdigest()
100100
self.assertEqual(result, self.expectedchecksum)
101101

102-
# TODO: RUSTPYTHON
103-
@unittest.expectedFailure
104102
@requires_resource('cpu')
105103
def test_name_inverse_lookup(self):
106104
for i in range(sys.maxunicode + 1):
@@ -326,8 +324,6 @@ def test_ucd_510(self):
326324
self.assertTrue("\u1d79".upper()=='\ua77d')
327325
self.assertTrue(".".upper()=='.')
328326

329-
# TODO: RUSTPYTHON
330-
@unittest.expectedFailure
331327
def test_bug_5828(self):
332328
self.assertEqual("\u1d79".lower(), "\u1d79")
333329
# Only U+0000 should have U+0000 as its upper/lower/titlecase variant
@@ -347,8 +343,6 @@ def test_bug_4971(self):
347343
self.assertEqual("\u01c5".title(), "\u01c5")
348344
self.assertEqual("\u01c6".title(), "\u01c5")
349345

350-
# TODO: RUSTPYTHON
351-
@unittest.expectedFailure
352346
def test_linebreak_7643(self):
353347
for i in range(0x10000):
354348
lines = (chr(i) + 'A').splitlines()

common/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@ rustpython-literal = { workspace = true }
1616

1717
ascii = { workspace = true }
1818
bitflags = { workspace = true }
19+
bstr = { workspace = true }
1920
cfg-if = { workspace = true }
2021
itertools = { workspace = true }
2122
libc = { workspace = true }
2223
malachite-bigint = { workspace = true }
2324
malachite-q = { workspace = true }
2425
malachite-base = { workspace = true }
26+
memchr = { workspace = true }
2527
num-complex = { workspace = true }
2628
num-traits = { workspace = true }
2729
once_cell = { workspace = true }

common/src/cformat.rs

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ use std::{
1111
str::FromStr,
1212
};
1313

14+
use crate::wtf8::{CodePoint, Wtf8, Wtf8Buf};
15+
1416
#[derive(Debug, PartialEq)]
1517
pub enum CFormatErrorType {
1618
UnmatchedKeyParentheses,
1719
MissingModuloSign,
18-
UnsupportedFormatChar(char),
20+
UnsupportedFormatChar(CodePoint),
1921
IncompleteFormat,
2022
IntTooBig,
2123
// Unimplemented,
@@ -39,7 +41,9 @@ impl fmt::Display for CFormatError {
3941
UnsupportedFormatChar(c) => write!(
4042
f,
4143
"unsupported format character '{}' ({:#x}) at index {}",
42-
c, c as u32, self.index
44+
c,
45+
c.to_u32(),
46+
self.index
4347
),
4448
IntTooBig => write!(f, "width/precision too big"),
4549
_ => write!(f, "unexpected error parsing format string"),
@@ -160,7 +164,7 @@ pub trait FormatBuf:
160164
fn concat(self, other: Self) -> Self;
161165
}
162166

163-
pub trait FormatChar: Copy + Into<char> + From<u8> {
167+
pub trait FormatChar: Copy + Into<CodePoint> + From<u8> {
164168
fn to_char_lossy(self) -> char;
165169
fn eq_char(self, c: char) -> bool;
166170
}
@@ -188,6 +192,29 @@ impl FormatChar for char {
188192
}
189193
}
190194

195+
impl FormatBuf for Wtf8Buf {
196+
type Char = CodePoint;
197+
fn chars(&self) -> impl Iterator<Item = Self::Char> {
198+
self.code_points()
199+
}
200+
fn len(&self) -> usize {
201+
(**self).len()
202+
}
203+
fn concat(mut self, other: Self) -> Self {
204+
self.extend([other]);
205+
self
206+
}
207+
}
208+
209+
impl FormatChar for CodePoint {
210+
fn to_char_lossy(self) -> char {
211+
self.to_char_lossy()
212+
}
213+
fn eq_char(self, c: char) -> bool {
214+
self == c
215+
}
216+
}
217+
191218
impl FormatBuf for Vec<u8> {
192219
type Char = u8;
193220
fn chars(&self) -> impl Iterator<Item = Self::Char> {
@@ -801,6 +828,15 @@ impl FromStr for CFormatString {
801828
}
802829
}
803830

831+
pub type CFormatWtf8 = CFormatStrOrBytes<Wtf8Buf>;
832+
833+
impl CFormatWtf8 {
834+
pub fn parse_from_wtf8(s: &Wtf8) -> Result<Self, CFormatError> {
835+
let mut iter = s.code_points().enumerate().peekable();
836+
Self::parse(&mut iter)
837+
}
838+
}
839+
804840
#[cfg(test)]
805841
mod tests {
806842
use super::*;

0 commit comments

Comments
 (0)