Skip to content

Commit

Permalink
Merge pull request gocolly#479 from WGH-/fix-relative-base
Browse files Browse the repository at this point in the history
Fix relative <base> URL
  • Loading branch information
asciimoo authored May 10, 2020
2 parents 06c3255 + 6058416 commit 6581387
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
4 changes: 2 additions & 2 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ func (c *Collector) handleOnHTML(resp *Response) error {
return err
}
if href, found := doc.Find("base[href]").Attr("href"); found {
resp.Request.baseURL, _ = url.Parse(href)
resp.Request.baseURL, _ = resp.Request.URL.Parse(href)
}
for _, cc := range c.htmlCallbacks {
i := 0
Expand Down Expand Up @@ -1096,7 +1096,7 @@ func (c *Collector) handleOnXML(resp *Response) error {
if e := htmlquery.FindOne(doc, "//base"); e != nil {
for _, a := range e.Attr {
if a.Key == "href" {
resp.Request.baseURL, _ = url.Parse(a.Val)
resp.Request.baseURL, _ = resp.Request.URL.Parse(a.Val)
break
}
}
Expand Down
40 changes: 40 additions & 0 deletions colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,21 @@ func newTestServer() *httptest.Server {
`))
})

// /base_relative serves an HTML page whose <base> element carries the
// host-relative href "/foobar/" and whose body contains a single link with
// the relative href "z". TestBaseTagRelative visits this route to check how
// a relative <base> is combined with a relative link.
mux.HandleFunc("/base_relative", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="/foobar/" />
</head>
<body>
<a href="z">link</a>
</body>
</html>
`))
})

mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/octet-stream")
ww := bufio.NewWriter(w)
Expand Down Expand Up @@ -767,6 +782,31 @@ func TestBaseTag(t *testing.T) {
c2.Visit(ts.URL + "/base")
}

// TestBaseTagRelative checks that a relative <base href> ("/foobar/" on the
// /base_relative test page) is resolved against the page's own URL, so that
// AbsoluteURL("z") yields <server>/foobar/z in both OnHTML and OnXML
// callbacks.
//
// The test also fails if a Visit call returns an error or if a callback is
// never invoked; otherwise a broken request or a selector that matches
// nothing would let the test pass without asserting anything.
func TestBaseTagRelative(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	pageURL := ts.URL + "/base_relative"
	expected := ts.URL + "/foobar/z"

	// The flags below are written from the callbacks and read after Visit
	// returns; this relies on the collector running callbacks before Visit
	// returns, which the original test already assumed by calling t.Errorf
	// inside them.
	htmlCalled := false
	c := NewCollector()
	c.OnHTML("a[href]", func(e *HTMLElement) {
		htmlCalled = true
		u := e.Request.AbsoluteURL(e.Attr("href"))
		if u != expected {
			t.Errorf("Invalid <base /> tag handling in OnHTML: expected %q, got %q", expected, u)
		}
	})
	if err := c.Visit(pageURL); err != nil {
		t.Errorf("OnHTML: visiting %q: %v", pageURL, err)
	} else if !htmlCalled {
		t.Errorf("OnHTML: callback was never invoked for %q", pageURL)
	}

	xmlCalled := false
	c2 := NewCollector()
	c2.OnXML("//a", func(e *XMLElement) {
		xmlCalled = true
		u := e.Request.AbsoluteURL(e.Attr("href"))
		if u != expected {
			t.Errorf("Invalid <base /> tag handling in OnXML: expected %q, got %q", expected, u)
		}
	})
	if err := c2.Visit(pageURL); err != nil {
		t.Errorf("OnXML: visiting %q: %v", pageURL, err)
	} else if !xmlCalled {
		t.Errorf("OnXML: callback was never invoked for %q", pageURL)
	}
}

func TestCollectorCookies(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit 6581387

Please sign in to comment.