[GURL] (2 of 2) Strip username/password/port when canonicalizing, if …

…the scheme doesn't support them The goal of this CL is to inhibit port numbers and usernames in internal schemes like "chrome-extension" and "chrome". Currently, navigations to chrome-extension:// URLs with ports actually get suprisingly far; it seems like no good can possibly come from that. A new SchemeType is added: SCHEME_WITH_HOST_AND_PORT (no user information). This is only used when canonicalizing the inner URL of filesystem: -- e.g., filesystem:http://user@host:20/temp/foo now canonicalizes to filesystem:http://host:20/temp/foo; whereas filesystem:chrome://user@host:20/temp/foo canonicalizes to filesystem:chrome://host/temp/foo Bug: 606001,809062 Cq-Include-Trybots: master.tryserver.chromium.mac:ios-simulator-cronet;master.tryserver.chromium.mac:ios-simulator-full-configs Change-Id: I77c5ba3d2fe964deb8aadae95a06519ce038c472 Reviewed-on: https://chromium-review.googlesource.com/974380 Reviewed-by: Vasilii Sukhanov <[email protected]> Reviewed-by: Tommy Li <[email protected]> Reviewed-by: Mike West <[email protected]> Commit-Queue: Nick Carter <[email protected]> Cr-Commit-Position: refs/heads/master@{#547882}
AryanGitHub · Apr 4, 2018 · ff69a10 · ff69a10
1 parent 3e0db46
commit ff69a10
Show file tree

Hide file tree

Showing 12 changed files with 237 additions and 158 deletions.
diff --git a/components/password_manager/core/browser/android_affiliation/affiliation_utils.cc b/components/password_manager/core/browser/android_affiliation/affiliation_utils.cc
@@ -53,7 +53,8 @@ bool CanonicalizeWebFacetURI(const std::string& input_uri,
   url::StdStringCanonOutput canonical_output(canonical_uri);
 
   bool canonicalization_succeeded = url::CanonicalizeStandardURL(
-      input_uri.c_str(), input_uri.size(), input_parsed, nullptr,
+      input_uri.c_str(), input_uri.size(), input_parsed,
+      url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
       &canonical_output, &canonical_parsed);
   canonical_output.Complete();
 

diff --git a/components/url_formatter/url_fixer_unittest.cc b/components/url_formatter/url_fixer_unittest.cc
@@ -248,64 +248,64 @@ struct FixupCase {
   const std::string input;
   const std::string output;
 } fixup_cases[] = {
-  {"www.google.com", "http://www.google.com/"},
-  {" www.google.com     ", "http://www.google.com/"},
-  {" foo.com/asdf  bar", "http://foo.com/asdf%20%20bar"},
-  {"..www.google.com..", "http://www.google.com./"},
-  {"http://......", "http://....../"},
-  {"http://host.com:ninety-two/", "http://host.com:ninety-two/"},
-  {"http://host.com:ninety-two?foo", "http://host.com:ninety-two/?foo"},
-  {"google.com:123", "http://google.com:123/"},
-  {"about:", "chrome://version/"},
-  {"about:foo", "chrome://foo/"},
-  {"about:version", "chrome://version/"},
-  {"about:blank", "about:blank"},
-  {"about:usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
-  {"about://usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
-  {"chrome:usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
-  {"chrome://usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
-  {"www:123", "http://www:123/"},
-  {"   www:123", "http://www:123/"},
-  {"www.google.com?foo", "http://www.google.com/?foo"},
-  {"www.google.com#foo", "http://www.google.com/#foo"},
-  {"www.google.com?", "http://www.google.com/?"},
-  {"www.google.com#", "http://www.google.com/#"},
-  {"www.google.com:123?foo#bar", "http://www.google.com:123/?foo#bar"},
-  {"[email protected]", "http://[email protected]/"},
-  {"\xE6\xB0\xB4.com", "http://xn--1rw.com/"},
-  // It would be better if this next case got treated as http, but I don't see
-  // a clean way to guess this isn't the new-and-exciting "user" scheme.
-  {"user:[email protected]:8080/", "user:[email protected]:8080/"},
-  // {"file:///c:/foo/bar%20baz.txt", "file:///C:/foo/bar%20baz.txt"},
-  // URLs which end with 0x85 (NEL in ISO-8859).
-  {"http://foo.com/s?q=\xd0\x85", "http://foo.com/s?q=%D0%85"},
-  {"http://foo.com/s?q=\xec\x97\x85", "http://foo.com/s?q=%EC%97%85"},
-  {"http://foo.com/s?q=\xf0\x90\x80\x85", "http://foo.com/s?q=%F0%90%80%85"},
-  // URLs which end with 0xA0 (non-break space in ISO-8859).
-  {"http://foo.com/s?q=\xd0\xa0", "http://foo.com/s?q=%D0%A0"},
-  {"http://foo.com/s?q=\xec\x97\xa0", "http://foo.com/s?q=%EC%97%A0"},
-  {"http://foo.com/s?q=\xf0\x90\x80\xa0", "http://foo.com/s?q=%F0%90%80%A0"},
-  // URLs containing IPv6 literals.
-  {"[2001:db8::2]", "http://[2001:db8::2]/"},
-  {"[::]:80", "http://[::]/"},
-  {"[::]:80/path", "http://[::]/path"},
-  {"[::]:180/path", "http://[::]:180/path"},
-  // TODO(pmarks): Maybe we should parse bare IPv6 literals someday. Currently
-  // the first colon is treated as a scheme separator, and we default
-  // unspecified schemes to "http".
-  {"::1", "http://:1/"},
-  // Semicolon as scheme separator for standard schemes.
-  {"http;//www.google.com/", "http://www.google.com/"},
-  {"about;chrome", "chrome://chrome/"},
-  // Semicolon in non-standard schemes is not replaced by colon.
-  {"whatsup;//fool", "http://whatsup%3B//fool"},
-  // Semicolon left as-is in URL itself.
-  {"http://host/port?query;moar", "http://host/port?query;moar"},
-  // Fewer slashes than expected.
-  {"http;www.google.com/", "http://www.google.com/"},
-  {"http;/www.google.com/", "http://www.google.com/"},
-  // Semicolon at start.
-  {";http://www.google.com/", "http://%3Bhttp//www.google.com/"},
+    {"www.google.com", "http://www.google.com/"},
+    {" www.google.com     ", "http://www.google.com/"},
+    {" foo.com/asdf  bar", "http://foo.com/asdf%20%20bar"},
+    {"..www.google.com..", "http://www.google.com./"},
+    {"http://......", "http://....../"},
+    {"http://host.com:ninety-two/", "http://host.com:ninety-two/"},
+    {"http://host.com:ninety-two?foo", "http://host.com:ninety-two/?foo"},
+    {"google.com:123", "http://google.com:123/"},
+    {"about:", "chrome://version/"},
+    {"about:foo", "chrome://foo/"},
+    {"about:version", "chrome://version/"},
+    {"about:blank", "about:blank"},
+    {"about:usr:pwd@hst:20/pth?qry#ref", "chrome://hst/pth?qry#ref"},
+    {"about://usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
+    {"chrome:usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
+    {"chrome://usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
+    {"www:123", "http://www:123/"},
+    {"   www:123", "http://www:123/"},
+    {"www.google.com?foo", "http://www.google.com/?foo"},
+    {"www.google.com#foo", "http://www.google.com/#foo"},
+    {"www.google.com?", "http://www.google.com/?"},
+    {"www.google.com#", "http://www.google.com/#"},
+    {"www.google.com:123?foo#bar", "http://www.google.com:123/?foo#bar"},
+    {"[email protected]", "http://[email protected]/"},
+    {"\xE6\xB0\xB4.com", "http://xn--1rw.com/"},
+    // It would be better if this next case got treated as http, but I don't see
+    // a clean way to guess this isn't the new-and-exciting "user" scheme.
+    {"user:[email protected]:8080/", "user:[email protected]:8080/"},
+    // {"file:///c:/foo/bar%20baz.txt", "file:///C:/foo/bar%20baz.txt"},
+    // URLs which end with 0x85 (NEL in ISO-8859).
+    {"http://foo.com/s?q=\xd0\x85", "http://foo.com/s?q=%D0%85"},
+    {"http://foo.com/s?q=\xec\x97\x85", "http://foo.com/s?q=%EC%97%85"},
+    {"http://foo.com/s?q=\xf0\x90\x80\x85", "http://foo.com/s?q=%F0%90%80%85"},
+    // URLs which end with 0xA0 (non-break space in ISO-8859).
+    {"http://foo.com/s?q=\xd0\xa0", "http://foo.com/s?q=%D0%A0"},
+    {"http://foo.com/s?q=\xec\x97\xa0", "http://foo.com/s?q=%EC%97%A0"},
+    {"http://foo.com/s?q=\xf0\x90\x80\xa0", "http://foo.com/s?q=%F0%90%80%A0"},
+    // URLs containing IPv6 literals.
+    {"[2001:db8::2]", "http://[2001:db8::2]/"},
+    {"[::]:80", "http://[::]/"},
+    {"[::]:80/path", "http://[::]/path"},
+    {"[::]:180/path", "http://[::]:180/path"},
+    // TODO(pmarks): Maybe we should parse bare IPv6 literals someday. Currently
+    // the first colon is treated as a scheme separator, and we default
+    // unspecified schemes to "http".
+    {"::1", "http://:1/"},
+    // Semicolon as scheme separator for standard schemes.
+    {"http;//www.google.com/", "http://www.google.com/"},
+    {"about;chrome", "chrome://chrome/"},
+    // Semicolon in non-standard schemes is not replaced by colon.
+    {"whatsup;//fool", "http://whatsup%3B//fool"},
+    // Semicolon left as-is in URL itself.
+    {"http://host/port?query;moar", "http://host/port?query;moar"},
+    // Fewer slashes than expected.
+    {"http;www.google.com/", "http://www.google.com/"},
+    {"http;/www.google.com/", "http://www.google.com/"},
+    // Semicolon at start.
+    {";http://www.google.com/", "http://%3Bhttp//www.google.com/"},
 };
 
 TEST(URLFixerTest, FixupURL) {

diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
@@ -197,7 +197,7 @@ TEST(GURLTest, CopyFileSystem) {
   GURL url2(url);
   EXPECT_TRUE(url2.is_valid());
 
-  EXPECT_EQ("filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref", url2.spec());
+  EXPECT_EQ("filesystem:https://google.com:99/t/foo;bar?q=a#ref", url2.spec());
   EXPECT_EQ("filesystem", url2.scheme());
   EXPECT_EQ("", url2.username());
   EXPECT_EQ("", url2.password());
@@ -211,8 +211,8 @@ TEST(GURLTest, CopyFileSystem) {
   const GURL* inner = url2.inner_url();
   ASSERT_TRUE(inner);
   EXPECT_EQ("https", inner->scheme());
-  EXPECT_EQ("user", inner->username());
-  EXPECT_EQ("pass", inner->password());
+  EXPECT_EQ("", inner->username());
+  EXPECT_EQ("", inner->password());
   EXPECT_EQ("google.com", inner->host());
   EXPECT_EQ("99", inner->port());
   EXPECT_EQ(99, inner->IntPort());

diff --git a/url/scheme_host_port.cc b/url/scheme_host_port.cc
@@ -60,6 +60,7 @@ bool IsValidInput(const base::StringPiece& scheme,
     return false;
 
   switch (scheme_type) {
+    case SCHEME_WITH_HOST_AND_PORT:
     case SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION:
       // A URL with |scheme| is required to have the host and port (may be
       // omitted in a serialization if it's the same as the default value).

diff --git a/url/url_canon.h b/url/url_canon.h
@@ -220,6 +220,31 @@ class URL_EXPORT CharsetConverter {
                                 CanonOutput* output) = 0;
 };
 
+// Schemes --------------------------------------------------------------------
+
+// Types of a scheme representing the requirements on the data represented by
+// the authority component of a URL with the scheme.
+enum SchemeType {
+  // The authority component of a URL with the scheme has the form
+  // "username:password@host:port". The username and password entries are
+  // optional; the host may not be empty. The default value of the port can be
+  // omitted in serialization. This type occurs with network schemes like http,
+  // https, and ftp.
+  SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION,
+  // The authority component of a URL with the scheme has the form "host:port",
+  // and does not include username or password. The default value of the port
+  // can be omitted in serialization. Used by inner URLs of filesystem URLs of
+  // origins with network hosts, from which the username and password are
+  // stripped.
+  SCHEME_WITH_HOST_AND_PORT,
+  // The authority component of an URL with the scheme has the form "host", and
+  // does not include port, username, or password. Used when the hosts are not
+  // network addresses; for example, schemes used internally by the browser.
+  SCHEME_WITH_HOST,
+  // A URL with the scheme doesn't have the authority component.
+  SCHEME_WITHOUT_AUTHORITY,
+};
+
 // Whitespace -----------------------------------------------------------------
 
 // Searches for whitespace that should be removed from the middle of URLs, and
@@ -549,12 +574,14 @@ URL_EXPORT void CanonicalizeRef(const base::char16* spec,
 URL_EXPORT bool CanonicalizeStandardURL(const char* spec,
                                         int spec_len,
                                         const Parsed& parsed,
+                                        SchemeType scheme_type,
                                         CharsetConverter* query_converter,
                                         CanonOutput* output,
                                         Parsed* new_parsed);
 URL_EXPORT bool CanonicalizeStandardURL(const base::char16* spec,
                                         int spec_len,
                                         const Parsed& parsed,
+                                        SchemeType scheme_type,
                                         CharsetConverter* query_converter,
                                         CanonOutput* output,
                                         Parsed* new_parsed);
@@ -802,13 +829,15 @@ class Replacements {
 URL_EXPORT bool ReplaceStandardURL(const char* base,
                                    const Parsed& base_parsed,
                                    const Replacements<char>& replacements,
+                                   SchemeType scheme_type,
                                    CharsetConverter* query_converter,
                                    CanonOutput* output,
                                    Parsed* new_parsed);
 URL_EXPORT bool ReplaceStandardURL(
     const char* base,
     const Parsed& base_parsed,
     const Replacements<base::char16>& replacements,
+    SchemeType scheme_type,
     CharsetConverter* query_converter,
     CanonOutput* output,
     Parsed* new_parsed);

diff --git a/url/url_canon_filesystemurl.cc b/url/url_canon_filesystemurl.cc
@@ -43,16 +43,22 @@ bool DoCanonicalizeFileSystemURL(const CHAR* spec,
     return false;
 
   bool success = true;
+  SchemeType inner_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
   if (CompareSchemeComponent(spec, inner_parsed->scheme, url::kFileScheme)) {
     new_inner_parsed.scheme.begin = output->length();
     output->Append("file://", 7);
     new_inner_parsed.scheme.len = 4;
     success &= CanonicalizePath(spec, inner_parsed->path, output,
                                 &new_inner_parsed.path);
-  } else if (IsStandard(spec, inner_parsed->scheme)) {
-    success = CanonicalizeStandardURL(spec, parsed.inner_parsed()->Length(),
-                                      *parsed.inner_parsed(), charset_converter,
-                                      output, &new_inner_parsed);
+  } else if (GetStandardSchemeType(spec, inner_parsed->scheme,
+                                   &inner_scheme_type)) {
+    if (inner_scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION) {
+      // Strip out the user information from the inner URL, if any.
+      inner_scheme_type = SCHEME_WITH_HOST_AND_PORT;
+    }
+    success = CanonicalizeStandardURL(
+        spec, parsed.inner_parsed()->Length(), *parsed.inner_parsed(),
+        inner_scheme_type, charset_converter, output, &new_inner_parsed);
   } else {
     // TODO(ericu): The URL is wrong, but should we try to output more of what
     // we were given?  Echoing back filesystem:mailto etc. doesn't seem all that

diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
@@ -12,6 +12,7 @@
 #include "url/url_constants.h"
 #include "url/url_file.h"
 #include "url/url_parse_internal.h"
+#include "url/url_util.h"
 #include "url/url_util_internal.h"
 
 namespace url {
@@ -407,7 +408,13 @@ bool DoResolveRelativeHost(const char* base_url,
   output->ReserveSizeIfNeeded(
       replacements.components().Length() +
       base_parsed.CountCharactersBefore(Parsed::USERNAME, false));
-  return ReplaceStandardURL(base_url, base_parsed, replacements,
+  SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+  if (!GetStandardSchemeType(base_url, base_parsed.scheme, &scheme_type)) {
+    // A path with an authority section gets canonicalized under standard URL
+    // rules, even though the base was not known to be standard.
+    scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
+  }
+  return ReplaceStandardURL(base_url, base_parsed, replacements, scheme_type,
                             query_converter, output, out_parsed);
 }