Skip to content

Commit

Permalink
[GURL] (2 of 2) Strip username/password/port when canonicalizing, if …
Browse files Browse the repository at this point in the history
…the scheme doesn't support them

The goal of this CL is to inhibit port numbers and usernames in internal schemes
like "chrome-extension" and "chrome". Currently, navigations to chrome-extension:// URLs
with ports actually get suprisingly far; it seems like no good can possibly come from
that.

A new SchemeType is added: SCHEME_WITH_HOST_AND_PORT (no user information).
This is only used when canonicalizing the inner URL of filesystem: -- e.g.,
filesystem:http://user@host:20/temp/foo now canonicalizes to
filesystem:http://host:20/temp/foo; whereas filesystem:chrome://user@host:20/temp/foo
canonicalizes to filesystem:chrome://host/temp/foo

Bug: 606001,809062
Cq-Include-Trybots: master.tryserver.chromium.mac:ios-simulator-cronet;master.tryserver.chromium.mac:ios-simulator-full-configs
Change-Id: I77c5ba3d2fe964deb8aadae95a06519ce038c472
Reviewed-on: https://chromium-review.googlesource.com/974380
Reviewed-by: Vasilii Sukhanov <[email protected]>
Reviewed-by: Tommy Li <[email protected]>
Reviewed-by: Mike West <[email protected]>
Commit-Queue: Nick Carter <[email protected]>
Cr-Commit-Position: refs/heads/master@{#547882}
  • Loading branch information
nick-chromium authored and Commit Bot committed Apr 4, 2018
1 parent 3e0db46 commit ff69a10
Show file tree
Hide file tree
Showing 12 changed files with 237 additions and 158 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ bool CanonicalizeWebFacetURI(const std::string& input_uri,
url::StdStringCanonOutput canonical_output(canonical_uri);

bool canonicalization_succeeded = url::CanonicalizeStandardURL(
input_uri.c_str(), input_uri.size(), input_parsed, nullptr,
input_uri.c_str(), input_uri.size(), input_parsed,
url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
&canonical_output, &canonical_parsed);
canonical_output.Complete();

Expand Down
116 changes: 58 additions & 58 deletions components/url_formatter/url_fixer_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,64 +248,64 @@ struct FixupCase {
const std::string input;
const std::string output;
} fixup_cases[] = {
{"www.google.com", "http://www.google.com/"},
{" www.google.com ", "http://www.google.com/"},
{" foo.com/asdf bar", "http://foo.com/asdf%20%20bar"},
{"..www.google.com..", "http://www.google.com./"},
{"http://......", "http://....../"},
{"http://host.com:ninety-two/", "http://host.com:ninety-two/"},
{"http://host.com:ninety-two?foo", "http://host.com:ninety-two/?foo"},
{"google.com:123", "http://google.com:123/"},
{"about:", "chrome://version/"},
{"about:foo", "chrome://foo/"},
{"about:version", "chrome://version/"},
{"about:blank", "about:blank"},
{"about:usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
{"about://usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
{"chrome:usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
{"chrome://usr:pwd@hst/pth?qry#ref", "chrome://usr:pwd@hst/pth?qry#ref"},
{"www:123", "http://www:123/"},
{" www:123", "http://www:123/"},
{"www.google.com?foo", "http://www.google.com/?foo"},
{"www.google.com#foo", "http://www.google.com/#foo"},
{"www.google.com?", "http://www.google.com/?"},
{"www.google.com#", "http://www.google.com/#"},
{"www.google.com:123?foo#bar", "http://www.google.com:123/?foo#bar"},
{"[email protected]", "http://[email protected]/"},
{"\xE6\xB0\xB4.com", "http://xn--1rw.com/"},
// It would be better if this next case got treated as http, but I don't see
// a clean way to guess this isn't the new-and-exciting "user" scheme.
{"user:[email protected]:8080/", "user:[email protected]:8080/"},
// {"file:///c:/foo/bar%20baz.txt", "file:///C:/foo/bar%20baz.txt"},
// URLs which end with 0x85 (NEL in ISO-8859).
{"http://foo.com/s?q=\xd0\x85", "http://foo.com/s?q=%D0%85"},
{"http://foo.com/s?q=\xec\x97\x85", "http://foo.com/s?q=%EC%97%85"},
{"http://foo.com/s?q=\xf0\x90\x80\x85", "http://foo.com/s?q=%F0%90%80%85"},
// URLs which end with 0xA0 (non-break space in ISO-8859).
{"http://foo.com/s?q=\xd0\xa0", "http://foo.com/s?q=%D0%A0"},
{"http://foo.com/s?q=\xec\x97\xa0", "http://foo.com/s?q=%EC%97%A0"},
{"http://foo.com/s?q=\xf0\x90\x80\xa0", "http://foo.com/s?q=%F0%90%80%A0"},
// URLs containing IPv6 literals.
{"[2001:db8::2]", "http://[2001:db8::2]/"},
{"[::]:80", "http://[::]/"},
{"[::]:80/path", "http://[::]/path"},
{"[::]:180/path", "http://[::]:180/path"},
// TODO(pmarks): Maybe we should parse bare IPv6 literals someday. Currently
// the first colon is treated as a scheme separator, and we default
// unspecified schemes to "http".
{"::1", "http://:1/"},
// Semicolon as scheme separator for standard schemes.
{"http;//www.google.com/", "http://www.google.com/"},
{"about;chrome", "chrome://chrome/"},
// Semicolon in non-standard schemes is not replaced by colon.
{"whatsup;//fool", "http://whatsup%3B//fool"},
// Semicolon left as-is in URL itself.
{"http://host/port?query;moar", "http://host/port?query;moar"},
// Fewer slashes than expected.
{"http;www.google.com/", "http://www.google.com/"},
{"http;/www.google.com/", "http://www.google.com/"},
// Semicolon at start.
{";http://www.google.com/", "http://%3Bhttp//www.google.com/"},
{"www.google.com", "http://www.google.com/"},
{" www.google.com ", "http://www.google.com/"},
{" foo.com/asdf bar", "http://foo.com/asdf%20%20bar"},
{"..www.google.com..", "http://www.google.com./"},
{"http://......", "http://....../"},
{"http://host.com:ninety-two/", "http://host.com:ninety-two/"},
{"http://host.com:ninety-two?foo", "http://host.com:ninety-two/?foo"},
{"google.com:123", "http://google.com:123/"},
{"about:", "chrome://version/"},
{"about:foo", "chrome://foo/"},
{"about:version", "chrome://version/"},
{"about:blank", "about:blank"},
{"about:usr:pwd@hst:20/pth?qry#ref", "chrome://hst/pth?qry#ref"},
{"about://usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
{"chrome:usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
{"chrome://usr:pwd@hst/pth?qry#ref", "chrome://hst/pth?qry#ref"},
{"www:123", "http://www:123/"},
{" www:123", "http://www:123/"},
{"www.google.com?foo", "http://www.google.com/?foo"},
{"www.google.com#foo", "http://www.google.com/#foo"},
{"www.google.com?", "http://www.google.com/?"},
{"www.google.com#", "http://www.google.com/#"},
{"www.google.com:123?foo#bar", "http://www.google.com:123/?foo#bar"},
{"[email protected]", "http://[email protected]/"},
{"\xE6\xB0\xB4.com", "http://xn--1rw.com/"},
// It would be better if this next case got treated as http, but I don't see
// a clean way to guess this isn't the new-and-exciting "user" scheme.
{"user:[email protected]:8080/", "user:[email protected]:8080/"},
// {"file:///c:/foo/bar%20baz.txt", "file:///C:/foo/bar%20baz.txt"},
// URLs which end with 0x85 (NEL in ISO-8859).
{"http://foo.com/s?q=\xd0\x85", "http://foo.com/s?q=%D0%85"},
{"http://foo.com/s?q=\xec\x97\x85", "http://foo.com/s?q=%EC%97%85"},
{"http://foo.com/s?q=\xf0\x90\x80\x85", "http://foo.com/s?q=%F0%90%80%85"},
// URLs which end with 0xA0 (non-break space in ISO-8859).
{"http://foo.com/s?q=\xd0\xa0", "http://foo.com/s?q=%D0%A0"},
{"http://foo.com/s?q=\xec\x97\xa0", "http://foo.com/s?q=%EC%97%A0"},
{"http://foo.com/s?q=\xf0\x90\x80\xa0", "http://foo.com/s?q=%F0%90%80%A0"},
// URLs containing IPv6 literals.
{"[2001:db8::2]", "http://[2001:db8::2]/"},
{"[::]:80", "http://[::]/"},
{"[::]:80/path", "http://[::]/path"},
{"[::]:180/path", "http://[::]:180/path"},
// TODO(pmarks): Maybe we should parse bare IPv6 literals someday. Currently
// the first colon is treated as a scheme separator, and we default
// unspecified schemes to "http".
{"::1", "http://:1/"},
// Semicolon as scheme separator for standard schemes.
{"http;//www.google.com/", "http://www.google.com/"},
{"about;chrome", "chrome://chrome/"},
// Semicolon in non-standard schemes is not replaced by colon.
{"whatsup;//fool", "http://whatsup%3B//fool"},
// Semicolon left as-is in URL itself.
{"http://host/port?query;moar", "http://host/port?query;moar"},
// Fewer slashes than expected.
{"http;www.google.com/", "http://www.google.com/"},
{"http;/www.google.com/", "http://www.google.com/"},
// Semicolon at start.
{";http://www.google.com/", "http://%3Bhttp//www.google.com/"},
};

TEST(URLFixerTest, FixupURL) {
Expand Down
6 changes: 3 additions & 3 deletions url/gurl_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ TEST(GURLTest, CopyFileSystem) {
GURL url2(url);
EXPECT_TRUE(url2.is_valid());

EXPECT_EQ("filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref", url2.spec());
EXPECT_EQ("filesystem:https://google.com:99/t/foo;bar?q=a#ref", url2.spec());
EXPECT_EQ("filesystem", url2.scheme());
EXPECT_EQ("", url2.username());
EXPECT_EQ("", url2.password());
Expand All @@ -211,8 +211,8 @@ TEST(GURLTest, CopyFileSystem) {
const GURL* inner = url2.inner_url();
ASSERT_TRUE(inner);
EXPECT_EQ("https", inner->scheme());
EXPECT_EQ("user", inner->username());
EXPECT_EQ("pass", inner->password());
EXPECT_EQ("", inner->username());
EXPECT_EQ("", inner->password());
EXPECT_EQ("google.com", inner->host());
EXPECT_EQ("99", inner->port());
EXPECT_EQ(99, inner->IntPort());
Expand Down
1 change: 1 addition & 0 deletions url/scheme_host_port.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ bool IsValidInput(const base::StringPiece& scheme,
return false;

switch (scheme_type) {
case SCHEME_WITH_HOST_AND_PORT:
case SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION:
// A URL with |scheme| is required to have the host and port (may be
// omitted in a serialization if it's the same as the default value).
Expand Down
29 changes: 29 additions & 0 deletions url/url_canon.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,31 @@ class URL_EXPORT CharsetConverter {
CanonOutput* output) = 0;
};

// Schemes --------------------------------------------------------------------

// Types of a scheme representing the requirements on the data represented by
// the authority component of a URL with the scheme.
enum SchemeType {
// The authority component of a URL with the scheme has the form
// "username:password@host:port". The username and password entries are
// optional; the host may not be empty. The default value of the port can be
// omitted in serialization. This type occurs with network schemes like http,
// https, and ftp.
SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION,
// The authority component of a URL with the scheme has the form "host:port",
// and does not include username or password. The default value of the port
// can be omitted in serialization. Used by inner URLs of filesystem URLs of
// origins with network hosts, from which the username and password are
// stripped.
SCHEME_WITH_HOST_AND_PORT,
// The authority component of an URL with the scheme has the form "host", and
// does not include port, username, or password. Used when the hosts are not
// network addresses; for example, schemes used internally by the browser.
SCHEME_WITH_HOST,
// A URL with the scheme doesn't have the authority component.
SCHEME_WITHOUT_AUTHORITY,
};

// Whitespace -----------------------------------------------------------------

// Searches for whitespace that should be removed from the middle of URLs, and
Expand Down Expand Up @@ -549,12 +574,14 @@ URL_EXPORT void CanonicalizeRef(const base::char16* spec,
URL_EXPORT bool CanonicalizeStandardURL(const char* spec,
int spec_len,
const Parsed& parsed,
SchemeType scheme_type,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
URL_EXPORT bool CanonicalizeStandardURL(const base::char16* spec,
int spec_len,
const Parsed& parsed,
SchemeType scheme_type,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
Expand Down Expand Up @@ -802,13 +829,15 @@ class Replacements {
URL_EXPORT bool ReplaceStandardURL(const char* base,
const Parsed& base_parsed,
const Replacements<char>& replacements,
SchemeType scheme_type,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
URL_EXPORT bool ReplaceStandardURL(
const char* base,
const Parsed& base_parsed,
const Replacements<base::char16>& replacements,
SchemeType scheme_type,
CharsetConverter* query_converter,
CanonOutput* output,
Parsed* new_parsed);
Expand Down
14 changes: 10 additions & 4 deletions url/url_canon_filesystemurl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,22 @@ bool DoCanonicalizeFileSystemURL(const CHAR* spec,
return false;

bool success = true;
SchemeType inner_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
if (CompareSchemeComponent(spec, inner_parsed->scheme, url::kFileScheme)) {
new_inner_parsed.scheme.begin = output->length();
output->Append("file://", 7);
new_inner_parsed.scheme.len = 4;
success &= CanonicalizePath(spec, inner_parsed->path, output,
&new_inner_parsed.path);
} else if (IsStandard(spec, inner_parsed->scheme)) {
success = CanonicalizeStandardURL(spec, parsed.inner_parsed()->Length(),
*parsed.inner_parsed(), charset_converter,
output, &new_inner_parsed);
} else if (GetStandardSchemeType(spec, inner_parsed->scheme,
&inner_scheme_type)) {
if (inner_scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION) {
// Strip out the user information from the inner URL, if any.
inner_scheme_type = SCHEME_WITH_HOST_AND_PORT;
}
success = CanonicalizeStandardURL(
spec, parsed.inner_parsed()->Length(), *parsed.inner_parsed(),
inner_scheme_type, charset_converter, output, &new_inner_parsed);
} else {
// TODO(ericu): The URL is wrong, but should we try to output more of what
// we were given? Echoing back filesystem:mailto etc. doesn't seem all that
Expand Down
9 changes: 8 additions & 1 deletion url/url_canon_relative.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "url/url_constants.h"
#include "url/url_file.h"
#include "url/url_parse_internal.h"
#include "url/url_util.h"
#include "url/url_util_internal.h"

namespace url {
Expand Down Expand Up @@ -407,7 +408,13 @@ bool DoResolveRelativeHost(const char* base_url,
output->ReserveSizeIfNeeded(
replacements.components().Length() +
base_parsed.CountCharactersBefore(Parsed::USERNAME, false));
return ReplaceStandardURL(base_url, base_parsed, replacements,
SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
if (!GetStandardSchemeType(base_url, base_parsed.scheme, &scheme_type)) {
// A path with an authority section gets canonicalized under standard URL
// rules, even though the base was not known to be standard.
scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
}
return ReplaceStandardURL(base_url, base_parsed, replacements, scheme_type,
query_converter, output, out_parsed);
}

Expand Down
Loading

0 comments on commit ff69a10

Please sign in to comment.