Skip to content

Commit ab50217

Browse files
authored
Merge pull request RustPython#3558 from fanninpm/codecs-3.10
Update codecs.py to CPython 3.10
2 parents ef90d09 + ead652b commit ab50217

File tree

4 files changed

+188
-37
lines changed

4 files changed

+188
-37
lines changed

Lib/codecs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@
8383
class CodecInfo(tuple):
8484
"""Codec details when looking up the codec registry"""
8585

86-
# Private API to allow Python 3.4 to blacklist the known non-Unicode
86+
# Private API to allow Python 3.4 to denylist the known non-Unicode
8787
# codecs in the standard library. A more general mechanism to
8888
# reliably distinguish test encodings from other codecs will hopefully
8989
# be defined for Python 3.5
@@ -386,7 +386,7 @@ def writelines(self, list):
386386

387387
def reset(self):
388388

389-
""" Flushes and resets the codec buffers used for keeping state.
389+
""" Resets the codec buffers used for keeping internal state.
390390
391391
Calling this method should ensure that the data on the
392392
output is put into a clean state, that allows appending
@@ -620,7 +620,7 @@ def readlines(self, sizehint=None, keepends=True):
620620

621621
def reset(self):
622622

623-
""" Resets the codec buffers used for keeping state.
623+
""" Resets the codec buffers used for keeping internal state.
624624
625625
Note that no stream repositioning should take place.
626626
This method is primarily intended to be able to recover

Lib/test/test_codecs.py

Lines changed: 161 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99

1010
from test import support
1111
from test.support import os_helper
12+
from test.support import warnings_helper
1213

1314
try:
1415
import _testcapi
15-
except ImportError as exc:
16+
except ImportError:
1617
_testcapi = None
1718

1819
try:
@@ -113,7 +114,7 @@ def check_partial(self, input, partialresults):
113114
q = Queue(b"")
114115
r = codecs.getreader(self.encoding)(q)
115116
result = ""
116-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
117+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
117118
q.write(bytes([c]))
118119
result += r.read()
119120
self.assertEqual(result, partialresult)
@@ -124,7 +125,7 @@ def check_partial(self, input, partialresults):
124125
# do the check again, this time using an incremental decoder
125126
d = codecs.getincrementaldecoder(self.encoding)()
126127
result = ""
127-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
128+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
128129
result += d.decode(bytes([c]))
129130
self.assertEqual(result, partialresult)
130131
# check that there's nothing left in the buffers
@@ -134,7 +135,7 @@ def check_partial(self, input, partialresults):
134135
# Check whether the reset method works properly
135136
d.reset()
136137
result = ""
137-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
138+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
138139
result += d.decode(bytes([c]))
139140
self.assertEqual(result, partialresult)
140141
# check that there's nothing left in the buffers
@@ -843,7 +844,7 @@ def test_bug691291(self):
843844
self.addCleanup(os_helper.unlink, os_helper.TESTFN)
844845
with open(os_helper.TESTFN, 'wb') as fp:
845846
fp.write(s)
846-
with support.check_warnings(('', DeprecationWarning)):
847+
with warnings_helper.check_warnings(('', DeprecationWarning)):
847848
reader = codecs.open(os_helper.TESTFN, 'U', encoding=self.encoding)
848849
with reader:
849850
self.assertEqual(reader.read(), s1)
@@ -1814,6 +1815,22 @@ def test_register(self):
18141815
self.assertRaises(TypeError, codecs.register)
18151816
self.assertRaises(TypeError, codecs.register, 42)
18161817

1818+
def test_unregister(self):
1819+
name = "nonexistent_codec_name"
1820+
search_function = mock.Mock()
1821+
codecs.register(search_function)
1822+
self.assertRaises(TypeError, codecs.lookup, name)
1823+
search_function.assert_called_with(name)
1824+
search_function.reset_mock()
1825+
1826+
codecs.unregister(search_function)
1827+
self.assertRaises(LookupError, codecs.lookup, name)
1828+
search_function.assert_not_called()
1829+
1830+
# TODO: RUSTPYTHON, AttributeError: module '_winapi' has no attribute 'GetACP'
1831+
if sys.platform == "win32":
1832+
test_unregister = unittest.expectedFailure(test_unregister)
1833+
18171834
def test_lookup(self):
18181835
self.assertRaises(TypeError, codecs.lookup)
18191836
self.assertRaises(LookupError, codecs.lookup, "__spam__")
@@ -2544,7 +2561,16 @@ def test_unicode_escape(self):
25442561
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
25452562

25462563

2547-
class UnicodeEscapeTest(unittest.TestCase):
2564+
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
2565+
encoding = "unicode-escape"
2566+
2567+
test_lone_surrogates = None
2568+
2569+
# TODO: RUSTPYTHON, TypeError: Expected type 'str', not 'bytes'
@unittest.expectedFailure
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
    # Override that only delegates to the inherited test, giving the
    # expectedFailure decorator above something to wrap.
    inherited_test = super().test_incremental_surrogatepass # TODO: RUSTPYTHON, remove when this passes
    inherited_test() # TODO: RUSTPYTHON, remove when this passes
2573+
25482574
def test_empty(self):
25492575
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
25502576
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2631,8 +2657,57 @@ def test_decode_errors(self):
26312657
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
26322658
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
26332659

2660+
# TODO: RUSTPYTHON, UnicodeDecodeError: ('unicodeescape', b'\\', 0, 1, '\\ at end of string')
2661+
@unittest.expectedFailure
2662+
def test_partial(self):
2663+
self.check_partial(
2664+
"\x00\t\n\r\\\xff\uffff\U00010000",
2665+
[
2666+
'',
2667+
'',
2668+
'',
2669+
'\x00',
2670+
'\x00',
2671+
'\x00\t',
2672+
'\x00\t',
2673+
'\x00\t\n',
2674+
'\x00\t\n',
2675+
'\x00\t\n\r',
2676+
'\x00\t\n\r',
2677+
'\x00\t\n\r\\',
2678+
'\x00\t\n\r\\',
2679+
'\x00\t\n\r\\',
2680+
'\x00\t\n\r\\',
2681+
'\x00\t\n\r\\\xff',
2682+
'\x00\t\n\r\\\xff',
2683+
'\x00\t\n\r\\\xff',
2684+
'\x00\t\n\r\\\xff',
2685+
'\x00\t\n\r\\\xff',
2686+
'\x00\t\n\r\\\xff',
2687+
'\x00\t\n\r\\\xff\uffff',
2688+
'\x00\t\n\r\\\xff\uffff',
2689+
'\x00\t\n\r\\\xff\uffff',
2690+
'\x00\t\n\r\\\xff\uffff',
2691+
'\x00\t\n\r\\\xff\uffff',
2692+
'\x00\t\n\r\\\xff\uffff',
2693+
'\x00\t\n\r\\\xff\uffff',
2694+
'\x00\t\n\r\\\xff\uffff',
2695+
'\x00\t\n\r\\\xff\uffff',
2696+
'\x00\t\n\r\\\xff\uffff',
2697+
'\x00\t\n\r\\\xff\uffff\U00010000',
2698+
]
2699+
)
2700+
2701+
class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
2702+
encoding = "raw-unicode-escape"
2703+
2704+
test_lone_surrogates = None
2705+
2706+
# TODO: RUSTPYTHON, AssertionError: '\\' != ''
@unittest.expectedFailure
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
    # Override that only delegates to the inherited test, giving the
    # expectedFailure decorator above something to wrap.
    inherited_test = super().test_incremental_surrogatepass # TODO: RUSTPYTHON, remove when this passes
    inherited_test() # TODO: RUSTPYTHON, remove when this passes
26342710

2635-
class RawUnicodeEscapeTest(unittest.TestCase):
26362711
def test_empty(self):
26372712
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
26382713
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@@ -2681,6 +2756,37 @@ def test_decode_errors(self):
26812756
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
26822757
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
26832758

2759+
# TODO: RUSTPYTHON, AssertionError: '\x00\t\n\r\\' != '\x00\t\n\r'
2760+
@unittest.expectedFailure
2761+
def test_partial(self):
2762+
self.check_partial(
2763+
"\x00\t\n\r\\\xff\uffff\U00010000",
2764+
[
2765+
'\x00',
2766+
'\x00\t',
2767+
'\x00\t\n',
2768+
'\x00\t\n\r',
2769+
'\x00\t\n\r',
2770+
'\x00\t\n\r\\\xff',
2771+
'\x00\t\n\r\\\xff',
2772+
'\x00\t\n\r\\\xff',
2773+
'\x00\t\n\r\\\xff',
2774+
'\x00\t\n\r\\\xff',
2775+
'\x00\t\n\r\\\xff',
2776+
'\x00\t\n\r\\\xff\uffff',
2777+
'\x00\t\n\r\\\xff\uffff',
2778+
'\x00\t\n\r\\\xff\uffff',
2779+
'\x00\t\n\r\\\xff\uffff',
2780+
'\x00\t\n\r\\\xff\uffff',
2781+
'\x00\t\n\r\\\xff\uffff',
2782+
'\x00\t\n\r\\\xff\uffff',
2783+
'\x00\t\n\r\\\xff\uffff',
2784+
'\x00\t\n\r\\\xff\uffff',
2785+
'\x00\t\n\r\\\xff\uffff',
2786+
'\x00\t\n\r\\\xff\uffff\U00010000',
2787+
]
2788+
)
2789+
26842790

26852791
class EscapeEncodeTest(unittest.TestCase):
26862792

@@ -2889,7 +2995,7 @@ def test_buffer_api_usage(self):
28892995
view_decoded = codecs.decode(view, encoding)
28902996
self.assertEqual(view_decoded, data)
28912997

2892-
def test_text_to_binary_blacklists_binary_transforms(self):
2998+
def test_text_to_binary_denylists_binary_transforms(self):
28932999
# Check binary -> binary codecs give a good error for str input
28943000
bad_input = "bad input type"
28953001
for encoding in bytes_transform_encodings:
@@ -2901,14 +3007,14 @@ def test_text_to_binary_blacklists_binary_transforms(self):
29013007
bad_input.encode(encoding)
29023008
self.assertIsNone(failure.exception.__cause__)
29033009

2904-
def test_text_to_binary_blacklists_text_transforms(self):
3010+
def test_text_to_binary_denylists_text_transforms(self):
29053011
# Check str.encode gives a good error message for str -> str codecs
29063012
msg = (r"^'rot_13' is not a text encoding; "
29073013
r"use codecs.encode\(\) to handle arbitrary codecs")
29083014
with self.assertRaisesRegex(LookupError, msg):
29093015
"just an example message".encode("rot_13")
29103016

2911-
def test_binary_to_text_blacklists_binary_transforms(self):
3017+
def test_binary_to_text_denylists_binary_transforms(self):
29123018
# Check bytes.decode and bytearray.decode give a good error
29133019
# message for binary -> binary codecs
29143020
data = b"encode first to ensure we meet any format restrictions"
@@ -2923,7 +3029,7 @@ def test_binary_to_text_blacklists_binary_transforms(self):
29233029
with self.assertRaisesRegex(LookupError, msg):
29243030
bytearray(encoded_data).decode(encoding)
29253031

2926-
def test_binary_to_text_blacklists_text_transforms(self):
3032+
def test_binary_to_text_denylists_text_transforms(self):
29273033
# Check str -> str codec gives a good error for binary input
29283034
for bad_input in (b"immutable", bytearray(b"mutable")):
29293035
with self.subTest(bad_input=bad_input):
@@ -2991,29 +3097,14 @@ def test_uu_invalid(self):
29913097

29923098
def _get_test_codec(codec_name):
    # Codec search function: hands back whatever _TEST_CODECS holds for the
    # requested name, or None when the name has no entry.
    codec_info = _TEST_CODECS.get(codec_name)
    return codec_info
2994-
codecs.register(_get_test_codec) # Returns None, not usable as a decorator
2995-
2996-
try:
2997-
# Issue #22166: Also need to clear the internal cache in CPython
2998-
from _codecs import _forget_codec
2999-
except ImportError:
3000-
def _forget_codec(codec_name):
3001-
pass
30023100

30033101

30043102
class ExceptionChainingTest(unittest.TestCase):
30053103

30063104
def setUp(self):
3007-
# There's no way to unregister a codec search function, so we just
3008-
# ensure we render this one fairly harmless after the test
3009-
# case finishes by using the test case repr as the codec name
3010-
# The codecs module normalizes codec names, although this doesn't
3011-
# appear to be formally documented...
3012-
# We also make sure we use a truly unique id for the custom codec
3013-
# to avoid issues with the codec cache when running these tests
3014-
# multiple times (e.g. when hunting for refleaks)
3015-
unique_id = repr(self) + str(id(self))
3016-
self.codec_name = encodings.normalize_encoding(unique_id).lower()
3105+
self.codec_name = 'exception_chaining_test'
3106+
codecs.register(_get_test_codec)
3107+
self.addCleanup(codecs.unregister, _get_test_codec)
30173108

30183109
# We store the object to raise on the instance because of a bad
30193110
# interaction between the codec caching (which means we can't
@@ -3028,10 +3119,6 @@ def tearDown(self):
30283119
_TEST_CODECS.pop(self.codec_name, None)
30293120
# Issue #22166: Also pop from caches to avoid appearance of ref leaks
30303121
encodings._cache.pop(self.codec_name, None)
3031-
try:
3032-
_forget_codec(self.codec_name)
3033-
except KeyError:
3034-
pass
30353122

30363123
def set_codec(self, encode, decode):
30373124
codec_info = codecs.CodecInfo(encode, decode,
@@ -3710,5 +3797,46 @@ def test_rot13_func(self):
37103797
'To be, or not to be, that is the question')
37113798

37123799

3800+
class CodecNameNormalizationTest(unittest.TestCase):
3801+
"""Test codec name normalization"""
3802+
# TODO: RUSTPYTHON, AssertionError: Tuples differ: (1, 2, 3, 4) != (None, None, None, None)
3803+
@unittest.expectedFailure
3804+
def test_codecs_lookup(self):
3805+
FOUND = (1, 2, 3, 4)
3806+
NOT_FOUND = (None, None, None, None)
3807+
def search_function(encoding):
3808+
if encoding == "aaa_8":
3809+
return FOUND
3810+
else:
3811+
return NOT_FOUND
3812+
3813+
codecs.register(search_function)
3814+
self.addCleanup(codecs.unregister, search_function)
3815+
self.assertEqual(FOUND, codecs.lookup('aaa_8'))
3816+
self.assertEqual(FOUND, codecs.lookup('AAA-8'))
3817+
self.assertEqual(FOUND, codecs.lookup('AAA---8'))
3818+
self.assertEqual(FOUND, codecs.lookup('AAA 8'))
3819+
self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
3820+
self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
3821+
self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
3822+
self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
3823+
self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
3824+
self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
3825+
3826+
# TODO: RUSTPYTHON, AssertionError
3827+
@unittest.expectedFailure
3828+
def test_encodings_normalize_encoding(self):
3829+
# encodings.normalize_encoding() ignores non-ASCII characters.
3830+
normalize = encodings.normalize_encoding
3831+
self.assertEqual(normalize('utf_8'), 'utf_8')
3832+
self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
3833+
self.assertEqual(normalize('utf 8'), 'utf_8')
3834+
# encodings.normalize_encoding() doesn't convert
3835+
# characters to lower case.
3836+
self.assertEqual(normalize('UTF 8'), 'UTF_8')
3837+
self.assertEqual(normalize('utf.8'), 'utf.8')
3838+
self.assertEqual(normalize('utf...8'), 'utf...8')
3839+
3840+
37133841
if __name__ == "__main__":
37143842
unittest.main()

vm/src/codecs.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use crate::{
22
builtins::{PyBaseExceptionRef, PyBytesRef, PyStr, PyStrRef, PyTuple, PyTupleRef},
33
common::{ascii, lock::PyRwLock},
44
function::IntoPyObject,
5-
PyContext, PyObject, PyObjectRef, PyResult, PyValue, TryFromObject, TypeProtocol,
5+
IdProtocol, PyContext, PyObject, PyObjectRef, PyResult, PyValue, TryFromObject, TypeProtocol,
66
VirtualMachine,
77
};
88
use std::borrow::Cow;
@@ -195,6 +195,24 @@ impl CodecsRegistry {
195195
Ok(())
196196
}
197197

198+
/// Removes a previously registered codec search function.
///
/// Entries of `search_path` are compared with `search_function` through
/// `get_id()`. The first entry with an equal id is removed, and any cached
/// lookups in `search_cache` are discarded first. When nothing matches
/// (including when `search_path` is empty) the registry is left untouched.
/// Every path returns `Ok(())`.
pub fn unregister(&self, search_function: PyObjectRef) -> PyResult<()> {
    let mut registry = self.inner.write();
    let target_id = search_function.get_id();
    // Find the index first so the shared borrow of `search_path` has ended
    // before the registry is mutated.
    let found = registry
        .search_path
        .iter()
        .position(|candidate| candidate.get_id() == target_id);
    if let Some(index) = found {
        if !registry.search_cache.is_empty() {
            registry.search_cache.clear();
        }
        registry.search_path.remove(index);
    }
    Ok(())
}
215+
198216
pub fn lookup(&self, encoding: &str, vm: &VirtualMachine) -> PyResult<PyCodec> {
199217
let encoding = normalize_encoding_name(encoding);
200218
let inner = self.inner.read();

vm/src/stdlib/codecs.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ mod _codecs {
1616
vm.state.codec_registry.register(search_function, vm)
1717
}
1818

19+
#[pyfunction]
20+
fn unregister(search_function: PyObjectRef, vm: &VirtualMachine) -> PyResult<()> {
21+
vm.state.codec_registry.unregister(search_function)
22+
}
23+
1924
#[pyfunction]
2025
fn lookup(encoding: PyStrRef, vm: &VirtualMachine) -> PyResult {
2126
vm.state

0 commit comments

Comments
 (0)