Skip to content

Commit 70168cf

Browse files
authored Apr 17, 2023
Merge pull request gocolly#763 from WGH-/fix-setcookie-self-redirect
Support websites redirecting to the same page when AllowURLRevisit is disabled
2 parents 336c8f7 + b4ca6a7 commit 70168cf

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed
 

‎colly.go

+15-7
Original file line number | Diff line number | Diff line change
@@ -1334,7 +1334,12 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
13341334
return fmt.Errorf("Not following redirect to %q: %w", req.URL, err)
13351335
}
13361336

1337-
if !c.AllowURLRevisit {
1337+
// allow redirects to the original destination
1338+
// to support websites redirecting to the same page while setting
1339+
// session cookies
1340+
samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String())
1341+
1342+
if !c.AllowURLRevisit && !samePageRedirect {
13381343
var body io.ReadCloser
13391344
if req.GetBody != nil {
13401345
var err error
@@ -1506,16 +1511,19 @@ func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
15061511
return false
15071512
}
15081513

1514+
func normalizeURL(u string) string {
1515+
parsed, err := urlParser.Parse(u)
1516+
if err != nil {
1517+
return u
1518+
}
1519+
return parsed.String()
1520+
}
1521+
15091522
func requestHash(url string, body io.Reader) uint64 {
15101523
h := fnv.New64a()
15111524
// reparse the url to fix ambiguities such as
15121525
// "http://example.com" vs "http://example.com/"
1513-
parsedWhatwgURL, err := whatwgUrl.Parse(url)
1514-
if err == nil {
1515-
h.Write([]byte(parsedWhatwgURL.String()))
1516-
} else {
1517-
h.Write([]byte(url))
1518-
}
1526+
io.WriteString(h, normalizeURL(url))
15191527
if body != nil {
15201528
io.Copy(h, body)
15211529
}

‎colly_test.go

+67-2
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ Disallow: /disallowed
4343
Disallow: /allowed*q=
4444
`
4545

46-
func newTestServer() *httptest.Server {
46+
func newUnstartedTestServer() *httptest.Server {
4747
mux := http.NewServeMux()
4848

4949
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
@@ -253,7 +253,13 @@ y">link</a>
253253
}
254254
})
255255

256-
return httptest.NewServer(mux)
256+
return httptest.NewUnstartedServer(mux)
257+
}
258+
259+
func newTestServer() *httptest.Server {
260+
srv := newUnstartedTestServer()
261+
srv.Start()
262+
return srv
257263
}
258264

259265
var newCollectorTests = map[string]func(*testing.T){
@@ -712,6 +718,33 @@ func TestCollectorURLRevisitCheck(t *testing.T) {
712718
}
713719
}
714720

721+
func TestSetCookieRedirect(t *testing.T) {
722+
type middleware = func(http.Handler) http.Handler
723+
for _, m := range []middleware{
724+
requireSessionCookieSimple,
725+
requireSessionCookieAuthPage,
726+
} {
727+
t.Run("", func(t *testing.T) {
728+
ts := newUnstartedTestServer()
729+
ts.Config.Handler = m(ts.Config.Handler)
730+
ts.Start()
731+
defer ts.Close()
732+
c := NewCollector()
733+
c.OnResponse(func(r *Response) {
734+
if got, want := r.Body, serverIndexResponse; !bytes.Equal(got, want) {
735+
t.Errorf("bad response body got=%q want=%q", got, want)
736+
}
737+
if got, want := r.StatusCode, http.StatusOK; got != want {
738+
t.Errorf("bad response code got=%d want=%d", got, want)
739+
}
740+
})
741+
if err := c.Visit(ts.URL); err != nil {
742+
t.Fatal(err)
743+
}
744+
})
745+
}
746+
}
747+
715748
func TestCollectorPostURLRevisitCheck(t *testing.T) {
716749
ts := newTestServer()
717750
defer ts.Close()
@@ -1587,3 +1620,35 @@ func BenchmarkOnResponse(b *testing.B) {
15871620
c.Visit(ts.URL)
15881621
}
15891622
}
1623+
1624+
// requireSessionCookieSimple wraps handler with a session-cookie gate.
//
// A request that carries no "session_id" cookie is answered with a
// 302 redirect to its own request URI together with a Set-Cookie header
// for that cookie. Any other request is passed to handler unchanged.
func requireSessionCookieSimple(handler http.Handler) http.Handler {
	const cookieName = "session_id"

	gate := func(w http.ResponseWriter, r *http.Request) {
		_, err := r.Cookie(cookieName)
		if err != http.ErrNoCookie {
			handler.ServeHTTP(w, r)
			return
		}

		// No session yet: hand out the cookie and send the client
		// straight back to the page it asked for.
		session := http.Cookie{Name: cookieName, Value: "1"}
		http.SetCookie(w, &session)
		http.Redirect(w, r, r.RequestURI, http.StatusFound)
	}
	return http.HandlerFunc(gate)
}
1636+
1637+
// requireSessionCookieAuthPage wraps handler with a session-cookie gate
// that goes through an intermediate "/auth" page.
//
//   - A request for "/auth" is redirected (302) to the value of its
//     "return" query parameter.
//   - Any other request without a "session_id" cookie receives that
//     cookie and is redirected (302) to "/auth?return=<escaped request URI>".
//   - Everything else is passed to handler unchanged.
func requireSessionCookieAuthPage(handler http.Handler) http.Handler {
	const (
		cookieName    = "session_id"
		setCookiePath = "/auth"
	)

	gate := func(w http.ResponseWriter, r *http.Request) {
		// The auth page only bounces the client to the "return" target.
		if r.URL.Path == setCookiePath {
			http.Redirect(w, r, r.URL.Query().Get("return"), http.StatusFound)
			return
		}

		_, err := r.Cookie(cookieName)
		if err != http.ErrNoCookie {
			handler.ServeHTTP(w, r)
			return
		}

		// No session yet: set the cookie and detour through the auth
		// page, remembering where the client wanted to go.
		session := http.Cookie{Name: cookieName, Value: "1"}
		http.SetCookie(w, &session)
		authURL := setCookiePath + "?return=" + url.QueryEscape(r.RequestURI)
		http.Redirect(w, r, authURL, http.StatusFound)
	}
	return http.HandlerFunc(gate)
}

0 commit comments

Comments
 (0)
Please sign in to comment.