Fix some issues with unicode in URLs

First, `UrlParser.update_query` didn't like 7-bit unclean values. `unicode()` should work everywhere `str()` did. Second, the check for emedded NBSPs in `UrlParser.is_web_safe_url` could be bypassed since `b'\xa0'` couldn't automatically be promoted to unicode (thus `u'\xa0'` != b'\xa0'.) The check was fixed to handle the NBSP char in either unicode or byte strings.
jayhung · May 18, 2015 · d7acfce · d7acfce
1 parent d0108cf
commit d7acfce
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 2 deletions.
diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py
@@ -522,7 +522,7 @@ def update_query(self, **updates):
         # Since in HTTP everything's a string, coercing values to strings now
         # makes equality testing easier.  Python will throw an error if you try
         # to pass in a non-string key, so that's already taken care of for us.
-        updates = {k: str(v) for k, v in updates.iteritems()}
+        updates = {k: _force_unicode(v) for k, v in updates.iteritems()}
         self.query_dict.update(updates)
 
     @property
@@ -715,7 +715,8 @@ def is_web_safe_url(self):
             # should be safe enough to allow after three slashes. Opera 12's the
             # only browser that trips over them, and it doesn't fall for
             # `http:///foo.com/`.
-            if match.group(0) == '\xa0':
+            # Check both in case unicode promotion fails
+            if match.group(0) in {u'\xa0', '\xa0'}:
                 if match.string[0:match.start(0)].count('/') < 3:
                     return False
             else:

diff --git a/r2/r2/tests/unit/lib/urlparser_test.py b/r2/r2/tests/unit/lib/urlparser_test.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 # The contents of this file are subject to the Common Public Attribution
 # License Version 1.0. (the "License"); you may not use this file except in
 # compliance with the License. You may obtain a copy of the License at
@@ -138,6 +139,12 @@ def test_nbsp_allowances(self):
         self.assertIsNotSafeRedditUrl("\xa0http://%s/" % g.domain)
         self.assertIsSafeRedditUrl("http://%s/\xa0" % g.domain)
         self.assertIsSafeRedditUrl("/foo/bar/\xa0baz")
+        # Make sure this works if the URL is unicode
+        self.assertIsNotSafeRedditUrl(u"http://\xa0.%s/" % g.domain)
+        self.assertIsNotSafeRedditUrl(u"\xa0http://%s/" % g.domain)
+        self.assertIsSafeRedditUrl(u"http://%s/\xa0" % g.domain)
+        self.assertIsSafeRedditUrl(u"/foo/bar/\xa0baz")
+
 
 
 class TestSwitchSubdomainByExtension(unittest.TestCase):
@@ -281,3 +288,9 @@ def test_integer_query_params(self):
         u2 = UrlParser('http://example.com/')
         u2.update_query(page=1234)
         self.assertEquals(u, u2)
+
+    def test_unicode_query_params(self):
+        u = UrlParser(u'http://example.com/?page=ｕｎｉｃｏｄｅ：（')
+        u2 = UrlParser('http://example.com/')
+        u2.update_query(page=u'ｕｎｉｃｏｄｅ：（')
+        self.assertEquals(u, u2)