Add UTS 46 10.0.0 test cases; Remove dot prefix stripping

kjd · Mar 22, 2018 · 6984ab7 · 6984ab7
1 parent 0b9191e
commit 6984ab7
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 15 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -7,6 +7,9 @@ History
 ++++++++++++++++
 
 - Update to Unicode 10.0.0.
+- No longer accepts dot-prefixed domains (e.g. ".example') as valid.
+  This is to be more conformant with the UTS 46 spec. Users should
+  strip dot prefixes from domains before processing.
 
 2.6 (2017-08-08)
 ++++++++++++++++

diff --git a/idna/core.py b/idna/core.py
@@ -344,15 +344,17 @@ def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
         labels = s.split('.')
     else:
         labels = _unicode_dots_re.split(s)
-    while labels and not labels[0]:
-        del labels[0]
-    if not labels:
+    if not labels or labels == ['']:
         raise IDNAError('Empty domain')
     if labels[-1] == '':
         del labels[-1]
         trailing_dot = True
     for label in labels:
-        result.append(alabel(label))
+        s = alabel(label)
+        if s:
+            result.append(s)
+        else:
+            raise IDNAError('Empty label')
     if trailing_dot:
         result.append(b'')
     s = b'.'.join(result)
@@ -373,15 +375,17 @@ def decode(s, strict=False, uts46=False, std3_rules=False):
         labels = _unicode_dots_re.split(s)
     else:
         labels = s.split(u'.')
-    while labels and not labels[0]:
-        del labels[0]
-    if not labels:
+    if not labels or labels == ['']:
         raise IDNAError('Empty domain')
     if not labels[-1]:
         del labels[-1]
         trailing_dot = True
     for label in labels:
-        result.append(ulabel(label))
+        s = ulabel(label)
+        if s:
+            result.append(s)
+        else:
+            raise IDNAError('Empty label')
     if trailing_dot:
         result.append(u'')
     return u'.'.join(result)
diff --git a/tests/IdnaTest.txt.gz b/tests/IdnaTest.txt.gz
diff --git a/tests/test_idna_uts46.py b/tests/test_idna_uts46.py
@@ -14,9 +14,34 @@
 
 _RE_UNICODE = re.compile(u"\\\\u([0-9a-fA-F]{4})")
 _RE_SURROGATE = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")
-_MISSING_NV8 = frozenset((525, 527, 529, 531, 1022, 2083, 2914, 2919, 3482,
-    3484, 4783, 4785))
+_SKIP_TESTS = [
+    # These appear to be errors in the test vectors. All relate to incorrectly applying
+    # bidi rules across label boundaries. Appears independently confirmed
+    # at http://www.alvestrand.no/pipermail/idna-update/2017-January/007946.html
+    '0\u00E0.\u05D0', '0a\u0300.\u05D0', '0A\u0300.\u05D0', '0\u00C0.\u05D0', 'xn--0-sfa.xn--4db',
+    '\u00E0\u02c7.\u05D0', 'a\u0300\u02c7.\u05D0', 'A\u0300\u02c7.\u05D0', '\u00C0\u02c7.\u05D0',
+    'xn--0ca88g.xn--4db', '0A.\u05D0', '0a.\u05D0', '0a.xn--4db', 'c.xn--0-eha.xn--4db',
+    'c.0\u00FC.\u05D0', 'c.0u\u0308.\u05D0', 'C.0U\u0308.\u05D0', 'C.0\u00DC.\u05D0',
+    '\u06B6\u06DF\u3002\u2087\uA806', '\u06B6\u06DF\u30027\uA806', 'xn--pkb6f.xn--7-x93e',
+    '\u06B6\u06DF.7\uA806', '1.\uAC7E6.\U00010C41\u06D0', '1.\u1100\u1165\u11B56.\U00010C41\u06D0',
+    '1.xn--6-945e.xn--glb1794k',
 
+    # These are transitional strings that compute to NV8 and thus are not supported
+    # in IDNA 2008.
+    '\U000102F7\u3002\u200D',
+    '\U0001D7F5\u9681\u2BEE\uFF0E\u180D\u200C',
+    '9\u9681\u2BEE.\u180D\u200C',
+    '\u00DF\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
+    'ss\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
+    '\u00DF\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
+    'ss\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
+    '\U00010A57\u200D\u3002\u2D09\u2D15',
+    '\U00010A57\u200D\uFF61\u2D09\u2D15',
+    '\U0001D7CF\U0001DA19\u2E16.\u200D',
+    '1\U0001DA19\u2E16.\u200D',
+    '\U0001D7E04\U000E01D7\U0001D23B\uFF0E\u200D\U000102F5\u26E7\u200D',
+    '84\U000E01D7\U0001D23B.\u200D\U000102F5\u26E7\u200D',
+]
 
 def unicode_fixup(string):
     """Replace backslash-u-XXXX with appropriate unicode characters."""
@@ -69,16 +94,17 @@ def runTest(self):
         except ValueError:
             raise unittest.SkipTest(
                 "Test requires Python wide Unicode support")
+        if source in _SKIP_TESTS:
+            return
         if not to_unicode:
             to_unicode = source
         if not to_ascii:
             to_ascii = to_unicode
-        nv8 = (len(self.fields) > 4 and self.fields[4] or
-            self.lineno in _MISSING_NV8)
+        nv8 = (len(self.fields) > 4 and self.fields[4])
         try:
             output = idna.decode(source, uts46=True, strict=True)
             if to_unicode[0] == u"[":
-                self.fail("decode() did not emit required error")
+                self.fail("decode() did not emit required error {0} for {1}".format(to_unicode, repr(source)))
             self.assertEqual(output, to_unicode, "unexpected decode() output")
         except (idna.IDNAError, UnicodeError, ValueError) as exc:
             if unicode(exc).startswith(u"Unknown directionality"):
@@ -96,8 +122,8 @@ def runTest(self):
                     transitional=transitional).decode("ascii")
                 if to_ascii[0] == u"[":
                     self.fail(
-                        "encode(transitional={0}) did not emit required error".
-                        format(transitional))
+                        "encode(transitional={0}) did not emit required error {1} for {2}".
+                        format(transitional, to_ascii, repr(source)))
                 self.assertEqual(output, to_ascii,
                     "unexpected encode(transitional={0}) output".
                     format(transitional))