Skip to content

Commit 9b55505

Browse files
authored
Merge pull request RustPython#4678 from dalinaum/test_unicodedata
Update test_unicodedata from CPython 3.11.2
2 parents 87728c4 + b687960 commit 9b55505

File tree

1 file changed

+128
-33
lines changed

1 file changed

+128
-33
lines changed

Lib/test/test_unicodedata.py

Lines changed: 128 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,32 @@
1-
""" Test script for the unicodedata module.
1+
""" Tests for the unicodedata module.
22
33
Written by Marc-Andre Lemburg (mal@lemburg.com).
44
55
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
66
77
"""
88

9+
import hashlib
10+
from http.client import HTTPException
911
import sys
12+
import unicodedata
1013
import unittest
11-
import hashlib
12-
from test.support import script_helper
13-
14-
encoding = 'utf-8'
15-
errors = 'surrogatepass'
14+
from test.support import (open_urlresource, requires_resource, script_helper,
15+
cpython_only, check_disallow_instantiation,
16+
ResourceDenied)
1617

1718

18-
### Run tests
19-
2019
class UnicodeMethodsTest(unittest.TestCase):
2120

2221
# update this, if the database changes
23-
expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1'
22+
expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'
2423

2524
# TODO: RUSTPYTHON
2625
@unittest.expectedFailure
26+
@requires_resource('cpu')
2727
def test_method_checksum(self):
2828
h = hashlib.sha1()
29-
for i in range(0x10000):
29+
for i in range(sys.maxunicode + 1):
3030
char = chr(i)
3131
data = [
3232
# Predicates (single char)
@@ -63,33 +63,26 @@ def test_method_checksum(self):
6363
(char + 'ABC').title(),
6464

6565
]
66-
h.update(''.join(data).encode(encoding, errors))
66+
h.update(''.join(data).encode('utf-8', 'surrogatepass'))
6767
result = h.hexdigest()
6868
self.assertEqual(result, self.expectedchecksum)
6969

7070
class UnicodeDatabaseTest(unittest.TestCase):
71-
72-
def setUp(self):
73-
# In case unicodedata is not available, this will raise an ImportError,
74-
# but the other test cases will still be run
75-
import unicodedata
76-
self.db = unicodedata
77-
78-
def tearDown(self):
79-
del self.db
71+
db = unicodedata
8072

8173
class UnicodeFunctionsTest(UnicodeDatabaseTest):
8274

8375
# Update this if the database changes. Make sure to do a full rebuild
8476
# (e.g. 'make distclean && make') to get the correct checksum.
85-
expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652'
77+
expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
8678
# TODO: RUSTPYTHON
8779
@unittest.expectedFailure
80+
@requires_resource('cpu')
8881
def test_function_checksum(self):
8982
data = []
9083
h = hashlib.sha1()
9184

92-
for i in range(0x10000):
85+
for i in range(sys.maxunicode + 1):
9386
char = chr(i)
9487
data = [
9588
# Properties
@@ -106,6 +99,15 @@ def test_function_checksum(self):
10699
result = h.hexdigest()
107100
self.assertEqual(result, self.expectedchecksum)
108101

102+
# TODO: RUSTPYTHON
103+
@unittest.expectedFailure
104+
@requires_resource('cpu')
105+
def test_name_inverse_lookup(self):
106+
for i in range(sys.maxunicode + 1):
107+
char = chr(i)
108+
if looked_name := self.db.name(char, None):
109+
self.assertEqual(self.db.lookup(looked_name), char)
110+
109111
# TODO: RUSTPYTHON
110112
@unittest.expectedFailure
111113
def test_digit(self):
@@ -201,15 +203,8 @@ def test_combining(self):
201203
self.assertRaises(TypeError, self.db.combining)
202204
self.assertRaises(TypeError, self.db.combining, 'xx')
203205

204-
def test_normalize(self):
205-
self.assertRaises(TypeError, self.db.normalize)
206-
self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
207-
self.assertEqual(self.db.normalize('NFKC', ''), '')
208-
# The rest can be found in test_normalization.py
209-
# which requires an external file.
210-
211206
def test_pr29(self):
212-
# http://www.unicode.org/review/pr-29.html
207+
# https://www.unicode.org/review/pr-29.html
213208
# See issues #1054943 and #10254.
214209
composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
215210
'Li\u030dt-s\u1e73\u0301',
@@ -240,9 +235,6 @@ def test_issue29456(self):
240235
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
241236
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
242237

243-
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
244-
# see test_normalization.py .
245-
246238
def test_east_asian_width(self):
247239
eaw = self.db.east_asian_width
248240
self.assertRaises(TypeError, eaw, b'a')
@@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):
265257

266258
class UnicodeMiscTest(UnicodeDatabaseTest):
267259

260+
@cpython_only
261+
def test_disallow_instantiation(self):
262+
# Ensure that the type disallows instantiation (bpo-43916)
263+
check_disallow_instantiation(self, unicodedata.UCD)
264+
268265
# TODO: RUSTPYTHON
269266
@unittest.expectedFailure
270267
def test_failed_import_during_compiling(self):
@@ -363,5 +360,103 @@ def test_linebreak_7643(self):
363360
self.assertEqual(len(lines), 1,
364361
r"\u%.4x should not be a linebreak" % i)
365362

363+
class NormalizationTest(unittest.TestCase):
    """Tests for ``unicodedata.normalize`` and ``unicodedata.is_normalized``.

    ``test_normalization`` fetches ``NormalizationTest.txt`` for the
    running interpreter's ``unicodedata.unidata_version`` and checks every
    record in it; the remaining tests need no external data.
    """

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our Unicode version.

        Reads exactly one line from *testfile* and tests whether
        ``unicodedata.unidata_version`` occurs in it.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Build a string from space-separated hexadecimal code points.

        For example ``"0041 0042"`` becomes ``"AB"``.
        """
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        """Download the normalization test data and run every record in it.

        The test is skipped when the data file cannot be downloaded or
        cannot be stored in the test data directory.
        """
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check all records of *testdata*, then every remaining code point.

        *testdata* is iterated line by line.  Text after ``#`` is ignored,
        blank lines are skipped and a line starting with ``@Part`` selects
        the current part.  Every other line is split on ``;``; the text
        after the last ``;`` is dropped and the five remaining fields are
        decoded with ``unistr`` into ``c1`` .. ``c5``.

        After the file is exhausted, every code point that did not appear
        as ``c1`` in ``@Part1`` must be unchanged by all four
        normalization forms.
        """
        part = None
        # Only membership is ever tested, so a set is sufficient.
        part1_data = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            # Drop a trailing comment, then surrounding whitespace.
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))

            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))

            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        """Check argument validation and the empty-string case of ``normalize``."""
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        """Regression check: this ``normalize`` call must complete without raising."""
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
459+
460+
366461
if __name__ == "__main__":
367462
unittest.main()

0 commit comments

Comments
 (0)