Skip to content

Commit

Permalink
Add UTS 46 10.0.0 test cases; Remove dot prefix stripping
Browse files Browse the repository at this point in the history
  • Loading branch information
kjd committed Mar 22, 2018
1 parent 0b9191e commit 6984ab7
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 15 deletions.
3 changes: 3 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ History
++++++++++++++++

- Update to Unicode 10.0.0.
- No longer accepts dot-prefixed domains (e.g. ".example') as valid.
This is to be more conformant with the UTS 46 spec. Users should
strip dot prefixes from domains before processing.

2.6 (2017-08-08)
++++++++++++++++
Expand Down
20 changes: 12 additions & 8 deletions idna/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,15 +344,17 @@ def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
labels = s.split('.')
else:
labels = _unicode_dots_re.split(s)
while labels and not labels[0]:
del labels[0]
if not labels:
if not labels or labels == ['']:
raise IDNAError('Empty domain')
if labels[-1] == '':
del labels[-1]
trailing_dot = True
for label in labels:
result.append(alabel(label))
s = alabel(label)
if s:
result.append(s)
else:
raise IDNAError('Empty label')
if trailing_dot:
result.append(b'')
s = b'.'.join(result)
Expand All @@ -373,15 +375,17 @@ def decode(s, strict=False, uts46=False, std3_rules=False):
labels = _unicode_dots_re.split(s)
else:
labels = s.split(u'.')
while labels and not labels[0]:
del labels[0]
if not labels:
if not labels or labels == ['']:
raise IDNAError('Empty domain')
if not labels[-1]:
del labels[-1]
trailing_dot = True
for label in labels:
result.append(ulabel(label))
s = ulabel(label)
if s:
result.append(s)
else:
raise IDNAError('Empty label')
if trailing_dot:
result.append(u'')
return u'.'.join(result)
Binary file modified tests/IdnaTest.txt.gz
Binary file not shown.
40 changes: 33 additions & 7 deletions tests/test_idna_uts46.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,34 @@

_RE_UNICODE = re.compile(u"\\\\u([0-9a-fA-F]{4})")
_RE_SURROGATE = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")
_MISSING_NV8 = frozenset((525, 527, 529, 531, 1022, 2083, 2914, 2919, 3482,
3484, 4783, 4785))
_SKIP_TESTS = [
# These appear to be errors in the test vectors. All relate to incorrectly applying
# bidi rules across label boundaries. Appears independently confirmed
# at http://www.alvestrand.no/pipermail/idna-update/2017-January/007946.html
'0\u00E0.\u05D0', '0a\u0300.\u05D0', '0A\u0300.\u05D0', '0\u00C0.\u05D0', 'xn--0-sfa.xn--4db',
'\u00E0\u02c7.\u05D0', 'a\u0300\u02c7.\u05D0', 'A\u0300\u02c7.\u05D0', '\u00C0\u02c7.\u05D0',
'xn--0ca88g.xn--4db', '0A.\u05D0', '0a.\u05D0', '0a.xn--4db', 'c.xn--0-eha.xn--4db',
'c.0\u00FC.\u05D0', 'c.0u\u0308.\u05D0', 'C.0U\u0308.\u05D0', 'C.0\u00DC.\u05D0',
'\u06B6\u06DF\u3002\u2087\uA806', '\u06B6\u06DF\u30027\uA806', 'xn--pkb6f.xn--7-x93e',
'\u06B6\u06DF.7\uA806', '1.\uAC7E6.\U00010C41\u06D0', '1.\u1100\u1165\u11B56.\U00010C41\u06D0',
'1.xn--6-945e.xn--glb1794k',

# These are transitional strings that compute to NV8 and thus are not supported
# in IDNA 2008.
'\U000102F7\u3002\u200D',
'\U0001D7F5\u9681\u2BEE\uFF0E\u180D\u200C',
'9\u9681\u2BEE.\u180D\u200C',
'\u00DF\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
'ss\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
'\u00DF\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
'ss\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
'\U00010A57\u200D\u3002\u2D09\u2D15',
'\U00010A57\u200D\uFF61\u2D09\u2D15',
'\U0001D7CF\U0001DA19\u2E16.\u200D',
'1\U0001DA19\u2E16.\u200D',
'\U0001D7E04\U000E01D7\U0001D23B\uFF0E\u200D\U000102F5\u26E7\u200D',
'84\U000E01D7\U0001D23B.\u200D\U000102F5\u26E7\u200D',
]

def unicode_fixup(string):
"""Replace backslash-u-XXXX with appropriate unicode characters."""
Expand Down Expand Up @@ -69,16 +94,17 @@ def runTest(self):
except ValueError:
raise unittest.SkipTest(
"Test requires Python wide Unicode support")
if source in _SKIP_TESTS:
return
if not to_unicode:
to_unicode = source
if not to_ascii:
to_ascii = to_unicode
nv8 = (len(self.fields) > 4 and self.fields[4] or
self.lineno in _MISSING_NV8)
nv8 = (len(self.fields) > 4 and self.fields[4])
try:
output = idna.decode(source, uts46=True, strict=True)
if to_unicode[0] == u"[":
self.fail("decode() did not emit required error")
self.fail("decode() did not emit required error {0} for {1}".format(to_unicode, repr(source)))
self.assertEqual(output, to_unicode, "unexpected decode() output")
except (idna.IDNAError, UnicodeError, ValueError) as exc:
if unicode(exc).startswith(u"Unknown directionality"):
Expand All @@ -96,8 +122,8 @@ def runTest(self):
transitional=transitional).decode("ascii")
if to_ascii[0] == u"[":
self.fail(
"encode(transitional={0}) did not emit required error".
format(transitional))
"encode(transitional={0}) did not emit required error {1} for {2}".
format(transitional, to_ascii, repr(source)))
self.assertEqual(output, to_ascii,
"unexpected encode(transitional={0}) output".
format(transitional))
Expand Down

0 comments on commit 6984ab7

Please sign in to comment.