diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index e17c682be..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,2 +0,0 @@ - \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..e330bb3e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,15 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..1f8e627d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,5 @@ +blank_issues_enabled: true +contact_links: + - name: Question + url: https://stackoverflow.com/ + about: Questions should go to Stack Overflow. You can use go-colly tag. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..bf3b90799 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,13 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..20f2af00a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,81 @@ +--- +name: CI +on: + push: + branches: + - '**' + pull_request: + +jobs: + test: + name: Test ${{matrix.go}} + runs-on: [ubuntu-latest] + strategy: + fail-fast: false + max-parallel: 4 + matrix: + go: [ + "1.22", + "1.21", + "1.20", + "1.19", + ] + + steps: + - name: Checkout branch + uses: actions/checkout@v2 + + - name: Setup go + uses: actions/setup-go@v2 + with: + go-version: ${{matrix.go}} + + - name: Test + run: | + go install golang.org/x/lint/golint@latest + OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1) + OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1) + golint -set_exit_status + go vet -v ./... + go test -race -v -coverprofile=coverage.txt -covermode=atomic ./... + + build: + name: Build ${{matrix.go}} + runs-on: [ubuntu-latest] + strategy: + fail-fast: false + max-parallel: 4 + matrix: + go: [ + "1.22", + "1.21", + "1.20", + "1.19", + ] + + steps: + - name: Checkout branch + uses: actions/checkout@v2 + + - name: Setup go + uses: actions/setup-go@v2 + with: + go-version: ${{matrix.go}} + + - name: Build + run: | + go install golang.org/x/lint/golint@latest + OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1) + OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1) + golint -set_exit_status + go build + + codecov: + name: Codecov + runs-on: [ubuntu-latest] + needs: + - test + - build + steps: + - name: Run Codecov + run: bash <(curl -s https://codecov.io/bash) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d72ef3847..000000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -language: go -sudo: false -go: - - 1.9.x - - 1.10.x - - 1.11.x - - tip -script: - - go get -u golang.org/x/lint/golint - - OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1) - - OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1) - - OUT="$(golint ./...)"; test -z "$OUT" || (echo "$OUT" && return 1) - - go vet -v ./... 
- - go test -race -v -coverprofile=coverage.txt -covermode=atomic ./ - go build -after_success: - bash <(curl -s https://codecov.io/bash) diff --git a/CHANGELOG.md b/CHANGELOG.md index 933d9eff1..166327f1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +# 2.1.0 - 2020.06.09 + + - HTTP tracing support + - New callback: OnResponseHeaders + - Queue fixes + - New collector option: Collector.CheckHead + - Proxy fixes + - Fixed POST revisit checking + - Updated dependencies + +# 2.0.0 - 2019.11.28 + + - Breaking change: the Collector.RedirectHandler member is replaced by the Collector.SetRedirectHandler function + - Go module support + - Collector.HasVisited method added to check whether a URL has already been visited + - Collector.SetClient method introduced + - HTMLElement.ChildTexts method added + - New user agents + - Multiple bugfixes + # 1.2.0 - 2019.02.13 - Compatibility with the latest htmlquery package diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 17df63602..c42dbc8f8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,7 +13,7 @@ We welcome any type of contribution, not only code. You can help with ## Your First Contribution -Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github). +Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://app.egghead.io/playlists/how-to-contribute-to-an-open-source-project-on-github). ## Submitting code diff --git a/README.md b/README.md index 06e73cbea..6205799ed 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ Colly provides a clean interface to write any kind of crawler/scraper/spider. With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving.
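Since this diff moves the project to the `/v2` module path, the sketch below shows what a minimal program against the new version looks like. It assumes that switching the import path is the only change most basic v1 code needs; the target URL is purely illustrative.

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2" // note the /v2 module path used throughout this diff
)

func main() {
	// Default collector; options such as colly.AllowedDomains(...) can be
	// passed as functional options, exactly as in v1.
	c := colly.NewCollector()

	// Print every link found on the visited page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		fmt.Println(e.Attr("href"))
	})

	c.Visit("http://go-colly.org/")
}
```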
-[![GoDoc](https://godoc.org/github.com/gocolly/colly?status.svg)](https://godoc.org/github.com/gocolly/colly) -[![Backers on Open Collective](https://opencollective.com/colly/backers/badge.svg)](#backers) [![Sponsors on Open Collective](https://opencollective.com/colly/sponsors/badge.svg)](#sponsors) [![build status](https://img.shields.io/travis/gocolly/colly/master.svg?style=flat-square)](https://travis-ci.org/gocolly/colly) +[![GoDoc](https://godoc.org/github.com/gocolly/colly?status.svg)](https://pkg.go.dev/github.com/gocolly/colly/v2) +[![Backers on Open Collective](https://opencollective.com/colly/backers/badge.svg)](#backers) [![Sponsors on Open Collective](https://opencollective.com/colly/sponsors/badge.svg)](#sponsors) [![build status](https://github.com/gocolly/colly/actions/workflows/ci.yml/badge.svg)](https://github.com/gocolly/colly/actions/workflows/ci.yml) [![report card](https://img.shields.io/badge/report%20card-a%2B-ff3333.svg?style=flat-square)](http://goreportcard.com/report/gocolly/colly) [![view examples](https://img.shields.io/badge/learn%20by-examples-0077b3.svg?style=flat-square)](https://github.com/gocolly/colly/tree/master/_examples) [![Code Coverage](https://img.shields.io/codecov/c/github/gocolly/colly/master.svg)](https://codecov.io/github/gocolly/colly?branch=master) @@ -15,20 +15,39 @@ With Colly you can easily extract structured data from websites, which can be us [![Twitter URL](https://img.shields.io/badge/twitter-follow-green.svg)](https://twitter.com/gocolly) -## Features +------ + + +## Sponsors + + Scrapfly.io - * Clean API - * Fast (>1k request/sec on a single core) - * Manages request delays and maximum concurrency per domain - * Automatic cookie and session handling - * Sync/async/parallel scraping - * Caching - * Automatic encoding of non-unicode responses - * Robots.txt support - * Distributed scraping - * Configuration via environment variables - * Extensions +[Scrapfly](https://scrapfly.io/?utm_source=Github&utm_medium=repo&utm_campaign=colly) +is an enterprise-grade solution providing a Web Scraping API that aims to simplify the +scraping process by managing everything: real browser rendering, rotating proxies, and +fingerprints (TLS, HTTP, browser) to bypass all major anti-bots. Scrapfly also provides +observability through an analytical dashboard, measuring the success rate/block +rate in detail. + + +------ + + + +## Features +- Clean API +- Fast (>1k request/sec on a single core) +- Manages request delays and maximum concurrency per domain +- Automatic cookie and session handling +- Sync/async/parallel scraping +- Caching +- Automatic encoding of non-unicode responses +- Robots.txt support +- Distributed scraping +- Configuration via environment variables +- Extensions ## Example @@ -51,29 +70,44 @@ func main() { See [examples folder](https://github.com/gocolly/colly/tree/master/_examples) for more detailed examples. - ## Installation +Add colly to your `go.mod` file: + ``` -go get -u github.com/gocolly/colly/... -``` +module github.com/x/y +go 1.14 + +require ( + github.com/gocolly/colly/v2 v2.1.0 +) +``` ## Bugs Bugs or suggestions?
Visit the [issue tracker](https://github.com/gocolly/colly/issues) or join `#colly` on freenode - ## Other Projects Using Colly Below is a list of public, open source projects that use Colly: - * [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive - * [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go - * [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scrapper, making your day a little better! - * [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you. - * [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects. - * [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line. +- [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive. +- [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go. +- [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scraper, making your day a little better! +- [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you. +- [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects. +- [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search PlayStation Store for your favorite PS4 games using the command line. +- [yringler/inside-chassidus-scraper](https://github.com/yringler/inside-chassidus-scraper) Scrapes Rabbi Paltiel's web site for lesson metadata. +- [gamedb/gamedb](https://github.com/gamedb/gamedb) A database of Steam games. +- [lawzava/scrape](https://github.com/lawzava/scrape) CLI for email scraping from any website. +- [eureka101v/WeiboSpiderGo](https://github.com/eureka101v/WeiboSpiderGo) A Sina Weibo (Chinese Twitter) scraper. +- [Go-phie/gophie](https://github.com/Go-phie/gophie) Search, download, and stream movies from your terminal. +- [imthaghost/goclone](https://github.com/imthaghost/goclone) Clone websites to your computer within seconds. +- [superiss/spidy](https://github.com/superiss/spidy) Crawl the web and collect expired domains. +- [docker-slim/docker-slim](https://github.com/docker-slim/docker-slim) Optimize your Docker containers to make them smaller and better. +- [seversky/gachifinder](https://github.com/seversky/gachifinder) An agent for asynchronous scraping, parsing, and writing to storage (Elasticsearch for now). +- [eval-exec/goodreads](https://github.com/eval-exec/goodreads) Crawl all tags and all pages of quotes from Goodreads. If you are using Colly in a project please send a pull request to add it to the list. @@ -82,14 +116,12 @@ If you are using Colly in a project please send a pull request to add it to the ## Contributors This project exists thanks to all the people who contribute. [[Contribute]](CONTRIBUTING.md). - ## Backers Thank you to all our backers! 🙏 [[Become a backer](https://opencollective.com/colly#backer)] - ## Sponsors Support this project by becoming a sponsor. @@ -105,8 +137,6 @@
Your logo will show up here with a l - - - ## License + [![FOSSA Status](https://app.fossa.io/api/projects/git%2Bgithub.com%2Fgocolly%2Fcolly.svg?type=large)](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_large) diff --git a/VERSION b/VERSION index 26aaba0e8..7ec1d6db4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.0 +2.1.0 diff --git a/_examples/basic/basic.go b/_examples/basic/basic.go index cd7abf4c1..b3f37251b 100644 --- a/_examples/basic/basic.go +++ b/_examples/basic/basic.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/coursera_courses/coursera_courses.go b/_examples/coursera_courses/coursera_courses.go index 45544a383..8526b9a95 100644 --- a/_examples/coursera_courses/coursera_courses.go +++ b/_examples/coursera_courses/coursera_courses.go @@ -6,7 +6,7 @@ import ( "os" "strings" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // Course stores information about a coursera course @@ -18,11 +18,18 @@ type Course struct { URL string Language string Commitment string - HowToPass string Rating string } func main() { + fName := "courses.json" + file, err := os.Create(fName) + if err != nil { + log.Fatalf("Cannot create file %q: %s\n", fName, err) + return + } + defer file.Close() + // Instantiate default collector c := colly.NewCollector( // Visit only domains: coursera.org, www.coursera.org @@ -38,7 +45,7 @@ func main() { courses := make([]Course, 0, 200) - // On every a element which has href attribute call callback + // On every element which has "href" attribute call callback c.OnHTML("a[href]", func(e *colly.HTMLElement) { // If attribute class is this long string return from callback // As this a is irrelevant @@ -59,8 +66,8 @@ func main() { log.Println("visiting", r.URL.String()) }) - // On every a HTML element which has name attribute call callback - c.OnHTML(`a[name]`, func(e *colly.HTMLElement) { + // On every element with collection-product-card class call callback + c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) { // Activate detailCollector if the link contains "coursera.org/learn" courseURL := e.Request.AbsoluteURL(e.Attr("href")) if strings.Index(courseURL, "coursera.org/learn") != -1 { @@ -71,7 +78,7 @@ func main() { // Extract details of the course detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) { log.Println("Course found", e.Request.URL) - title := e.ChildText(".course-title") + title := e.ChildText(".banner-title") if title == "" { log.Println("No title found", e.Request.URL) } @@ -79,22 +86,23 @@ func main() { Title: title, URL: e.Request.URL.String(), Description: e.ChildText("div.content"), - Creator: e.ChildText("div.creator-names > span"), + Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"), + Rating: e.ChildText("span.number-rating"), } - // Iterate over rows of the table which contains different information - // about the course - e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) { - switch el.ChildText("td:first-child") { - case "Language": - course.Language = el.ChildText("td:nth-child(2)") + // Iterate over div components and add details to course + e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) { + svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ") + lastWord := svgTitle[len(svgTitle)-1] + switch lastWord { + // svg Title: Available Languages + case "languages": 
+ course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)") + // svg Title: Mixed/Beginner/Intermediate/Advanced Level case "Level": - course.Level = el.ChildText("td:nth-child(2)") - case "Commitment": - course.Commitment = el.ChildText("td:nth-child(2)") - case "How To Pass": - course.HowToPass = el.ChildText("td:nth-child(2)") - case "User Ratings": - course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)") + course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)") + // svg Title: Hours to complete + case "complete": + course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)") } }) courses = append(courses, course) @@ -103,7 +111,7 @@ func main() { // Start scraping on http://coursera.com/browse c.Visit("https://coursera.org/browse") - enc := json.NewEncoder(os.Stdout) + enc := json.NewEncoder(file) enc.SetIndent("", " ") // Dump json to the standard output diff --git a/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go b/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go index b84bb3457..3de34d8f8 100644 --- a/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go +++ b/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go @@ -5,7 +5,7 @@ import ( "log" "os" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { @@ -20,21 +20,22 @@ func main() { defer writer.Flush() // Write CSV header - writer.Write([]string{"Name", "Symbol", "Price (USD)", "Volume (USD)", "Market capacity (USD)", "Change (1h)", "Change (24h)", "Change (7d)"}) + writer.Write([]string{"Name", "Symbol", "Market Cap (USD)", "Price (USD)", "Circulating Supply (USD)", "Volume (24h)", "Change (1h)", "Change (24h)", "Change (7d)"}) // Instantiate default collector c := colly.NewCollector() - c.OnHTML("#currencies-all tbody tr", func(e *colly.HTMLElement) { + c.OnHTML("tbody tr", func(e *colly.HTMLElement) { writer.Write([]string{ - e.ChildText(".currency-name-container"), - e.ChildText(".col-symbol"), - e.ChildAttr("a.price", "data-usd"), - e.ChildAttr("a.volume", "data-usd"), - e.ChildAttr(".market-cap", "data-usd"), - e.ChildText(".percent-1h"), - e.ChildText(".percent-24h"), - e.ChildText(".percent-7d"), + e.ChildText(".cmc-table__column-name"), + e.ChildText(".cmc-table__cell--sort-by__symbol"), + e.ChildText(".cmc-table__cell--sort-by__market-cap"), + e.ChildText(".cmc-table__cell--sort-by__price"), + e.ChildText(".cmc-table__cell--sort-by__circulating-supply"), + e.ChildText(".cmc-table__cell--sort-by__volume-24-h"), + e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"), + e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"), + e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"), }) }) diff --git a/_examples/error_handling/error_handling.go b/_examples/error_handling/error_handling.go index 7ac9d8f54..7d9d3d797 100644 --- a/_examples/error_handling/error_handling.go +++ b/_examples/error_handling/error_handling.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/factba.se/factbase.go b/_examples/factba.se/factbase.go index 76edfc067..440acfd98 100644 --- a/_examples/factba.se/factbase.go +++ b/_examples/factba.se/factbase.go @@ -3,10 +3,10 @@ package main import ( "encoding/json" "fmt" - "io/ioutil" + "os" "strconv" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) var baseSearchURL = "https://factba.se/json/json-transcript.php?q=&f=&dt=&p=" @@ -45,7 +45,7 @@ func main() { if err != nil { return } - 
ioutil.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644) + os.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644) }) stop := false diff --git a/_examples/google_groups/google_groups.go b/_examples/google_groups/google_groups.go index cbeb97e9c..d838a831d 100644 --- a/_examples/google_groups/google_groups.go +++ b/_examples/google_groups/google_groups.go @@ -7,7 +7,7 @@ import ( "os" "strings" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // Mail is the container of a single e-mail diff --git a/_examples/hackernews_comments/hackernews_comments.go b/_examples/hackernews_comments/hackernews_comments.go index 8859a5f3c..1ecc3086a 100644 --- a/_examples/hackernews_comments/hackernews_comments.go +++ b/_examples/hackernews_comments/hackernews_comments.go @@ -8,7 +8,7 @@ import ( "strconv" "strings" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) type comment struct { diff --git a/_examples/instagram/instagram.go b/_examples/instagram/instagram.go index de2b21368..c514ce0e7 100644 --- a/_examples/instagram/instagram.go +++ b/_examples/instagram/instagram.go @@ -10,7 +10,7 @@ import ( "regexp" "strings" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // "id": user id, "after": end cursor diff --git a/_examples/local_files/local_files b/_examples/local_files/local_files deleted file mode 100755 index 4d7f677b5..000000000 Binary files a/_examples/local_files/local_files and /dev/null differ diff --git a/_examples/local_files/local_files.go b/_examples/local_files/local_files.go index ba2b9865c..3473a67f5 100644 --- a/_examples/local_files/local_files.go +++ b/_examples/local_files/local_files.go @@ -6,7 +6,7 @@ import ( "os" "path/filepath" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/login/login.go b/_examples/login/login.go index eeadeba36..ab7a6b06d 100644 --- a/_examples/login/login.go +++ b/_examples/login/login.go @@ -3,7 +3,7 @@ package main import ( "log" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/max_depth/max_depth.go b/_examples/max_depth/max_depth.go index 2e28e568c..d11af1805 100644 --- a/_examples/max_depth/max_depth.go +++ b/_examples/max_depth/max_depth.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/multipart/multipart.go b/_examples/multipart/multipart.go index d8809241f..6d74facf9 100644 --- a/_examples/multipart/multipart.go +++ b/_examples/multipart/multipart.go @@ -2,19 +2,19 @@ package main import ( "fmt" - "io/ioutil" + "io" "net/http" "os" "time" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func generateFormData() map[string][]byte { f, _ := os.Open("gocolly.jpg") defer f.Close() - imgData, _ := ioutil.ReadAll(f) + imgData, _ := io.ReadAll(f) return map[string][]byte{ "firstname": []byte("one"), diff --git a/_examples/openedx_courses/openedx_courses.go b/_examples/openedx_courses/openedx_courses.go index 293637541..f9a70d1c2 100644 --- a/_examples/openedx_courses/openedx_courses.go +++ b/_examples/openedx_courses/openedx_courses.go @@ -6,11 +6,11 @@ import ( "strings" "time" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // DATE_FORMAT default format date used in openedx -const DATE_FORMAT = "Jan 02, 2006" +const DATE_FORMAT = "02 Jan, 2006" // Course store 
openedx course data type Course struct { @@ -42,18 +42,19 @@ func main() { if !strings.HasPrefix(link, "/courses/") { return } - // start scaping the page under the link found + // start scraping the page under the link found e.Request.Visit(link) }) - c.OnHTML("div[class=content-wrapper]", func(e *colly.HTMLElement) { - if e.DOM.Find("section.course-info").Length() == 0 { + c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) { + if e.DOM.Find("section#course-info").Length() == 0 { return } - title := strings.Split(e.ChildText(".course-title"), "\n")[0] + title := strings.Split(e.ChildText(".course-info__title"), "\n")[0] course_id := e.ChildAttr("input[name=course_id]", "value") - start_date, _ := time.Parse(DATE_FORMAT, e.ChildText("span.start-date")) - end_date, _ := time.Parse(DATE_FORMAT, e.ChildText("span.final-date")) + texts := e.ChildTexts("span[data-datetime]") + start_date, _ := time.Parse(DATE_FORMAT, texts[0]) + end_date, _ := time.Parse(DATE_FORMAT, texts[1]) var run string if len(strings.Split(course_id, "_")) > 1 { run = strings.Split(course_id, "_")[1] diff --git a/_examples/parallel/parallel.go b/_examples/parallel/parallel.go index 837b85b20..9a5c9ae62 100644 --- a/_examples/parallel/parallel.go +++ b/_examples/parallel/parallel.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { @@ -12,7 +12,7 @@ func main() { // MaxDepth is 2, so only the links on the scraped page // and links on those pages are visited colly.MaxDepth(2), - colly.Async(true), + colly.Async(), ) // Limit the maximum parallelism to 2 diff --git a/_examples/proxy_switcher/proxy_switcher.go b/_examples/proxy_switcher/proxy_switcher.go index 022699f47..4f0154483 100644 --- a/_examples/proxy_switcher/proxy_switcher.go +++ b/_examples/proxy_switcher/proxy_switcher.go @@ -4,8 +4,8 @@ import ( "bytes" "log" - "github.com/gocolly/colly" - "github.com/gocolly/colly/proxy" + "github.com/gocolly/colly/v2" + "github.com/gocolly/colly/v2/proxy" ) func main() { diff --git a/_examples/queue/queue.go b/_examples/queue/queue.go index ddf70a660..e6cacc3c2 100644 --- a/_examples/queue/queue.go +++ b/_examples/queue/queue.go @@ -3,8 +3,8 @@ package main import ( "fmt" - "github.com/gocolly/colly" - "github.com/gocolly/colly/queue" + "github.com/gocolly/colly/v2" + "github.com/gocolly/colly/v2/queue" ) func main() { diff --git a/_examples/random_delay/random_delay.go b/_examples/random_delay/random_delay.go index d9f58a250..21037efdf 100644 --- a/_examples/random_delay/random_delay.go +++ b/_examples/random_delay/random_delay.go @@ -4,8 +4,8 @@ import ( "fmt" "time" - "github.com/gocolly/colly" - "github.com/gocolly/colly/debug" + "github.com/gocolly/colly/v2" + "github.com/gocolly/colly/v2/debug" ) func main() { @@ -15,7 +15,7 @@ func main() { c := colly.NewCollector( // Attach a debugger to the collector colly.Debugger(&debug.LogDebugger{}), - colly.Async(true), + colly.Async(), ) // Limit the number of threads started by colly to two diff --git a/_examples/rate_limit/rate_limit.go b/_examples/rate_limit/rate_limit.go index e17f4941f..0c533ceea 100644 --- a/_examples/rate_limit/rate_limit.go +++ b/_examples/rate_limit/rate_limit.go @@ -3,8 +3,8 @@ package main import ( "fmt" - "github.com/gocolly/colly" - "github.com/gocolly/colly/debug" + "github.com/gocolly/colly/v2" + "github.com/gocolly/colly/v2/debug" ) func main() { @@ -13,7 +13,7 @@ func main() { // Instantiate default collector c := colly.NewCollector( // Turn on asynchronous 
requests - colly.Async(true), + colly.Async(), // Attach a debugger to the collector colly.Debugger(&debug.LogDebugger{}), ) diff --git a/_examples/reddit/reddit.go b/_examples/reddit/reddit.go index bd69f396d..06e86148c 100644 --- a/_examples/reddit/reddit.go +++ b/_examples/reddit/reddit.go @@ -5,7 +5,7 @@ import ( "os" "time" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) type item struct { @@ -23,6 +23,7 @@ func main() { c := colly.NewCollector( // Visit only domains: old.reddit.com colly.AllowedDomains("old.reddit.com"), + // Parallelism colly.Async(true), ) diff --git a/_examples/request_context/request_context.go b/_examples/request_context/request_context.go index b4b79b435..ace7edfbe 100644 --- a/_examples/request_context/request_context.go +++ b/_examples/request_context/request_context.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/scraper_server/scraper_server.go b/_examples/scraper_server/scraper_server.go index 6d0f0d85b..3c4bca6e9 100644 --- a/_examples/scraper_server/scraper_server.go +++ b/_examples/scraper_server/scraper_server.go @@ -5,7 +5,7 @@ import ( "log" "net/http" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) type pageInfo struct { diff --git a/_examples/shopify_sitemap/shopify_sitemap.go b/_examples/shopify_sitemap/shopify_sitemap.go index c769f37f4..e26d8a185 100644 --- a/_examples/shopify_sitemap/shopify_sitemap.go +++ b/_examples/shopify_sitemap/shopify_sitemap.go @@ -3,7 +3,7 @@ package main import ( "fmt" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/url_filter/url_filter.go b/_examples/url_filter/url_filter.go index a4560f94c..a9210f127 100644 --- a/_examples/url_filter/url_filter.go +++ b/_examples/url_filter/url_filter.go @@ -4,7 +4,7 @@ import ( "fmt" "regexp" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { diff --git a/_examples/xkcd_store/xkcd_store.go b/_examples/xkcd_store/xkcd_store.go index e77a6cff6..43f233a16 100644 --- a/_examples/xkcd_store/xkcd_store.go +++ b/_examples/xkcd_store/xkcd_store.go @@ -5,7 +5,7 @@ import ( "log" "os" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { @@ -33,7 +33,7 @@ func main() { e.ChildAttr("a", "title"), e.ChildText("span"), e.Request.AbsoluteURL(e.ChildAttr("a", "href")), - "https" + e.ChildAttr("img", "src"), + "https:" + e.ChildAttr("img", "src"), }) }) diff --git a/assets/scrapfly.png b/assets/scrapfly.png new file mode 100644 index 000000000..e4b384f66 Binary files /dev/null and b/assets/scrapfly.png differ diff --git a/cmd/colly/colly.go b/cmd/colly/colly.go index 8ad240a37..a8e626fd3 100644 --- a/cmd/colly/colly.go +++ b/cmd/colly/colly.go @@ -29,7 +29,7 @@ var scraperHeadTemplate = `package main import ( "log" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) func main() { @@ -48,19 +48,19 @@ var htmlCallbackTemplate = ` ` var requestCallbackTemplate = ` - c.OnRequest("element-selector", func(r *colly.Request) { + c.OnRequest(func(r *colly.Request) { log.Println("Visiting", r.URL) }) ` var responseCallbackTemplate = ` - c.OnResponse("element-selector", func(r *colly.Response) { + c.OnResponse(func(r *colly.Response) { log.Println("Visited", r.Request.URL, r.StatusCode) }) ` var errorCallbackTemplate = ` - c.OnError("element-selector", func(r *colly.Response, err error) { + c.OnError(func(r *colly.Response, err error) { log.Printf("Error on %s: %s", 
r.Request.URL, err) }) ` diff --git a/colly.go b/colly.go index 587174ba3..ae74b7c3e 100644 --- a/colly.go +++ b/colly.go @@ -24,7 +24,6 @@ import ( "fmt" "hash/fnv" "io" - "io/ioutil" "log" "net/http" "net/http/cookiejar" @@ -38,22 +37,26 @@ import ( "sync/atomic" "time" - "google.golang.org/appengine/urlfetch" - "github.com/PuerkitoBio/goquery" "github.com/antchfx/htmlquery" "github.com/antchfx/xmlquery" + "github.com/gocolly/colly/v2/debug" + "github.com/gocolly/colly/v2/storage" "github.com/kennygrant/sanitize" + whatwgUrl "github.com/nlnwa/whatwg-url/url" "github.com/temoto/robotstxt" - - "github.com/gocolly/colly/debug" - "github.com/gocolly/colly/storage" + "google.golang.org/appengine/urlfetch" ) +// A CollectorOption sets an option on a Collector. +type CollectorOption func(*Collector) + // Collector provides the scraper instance for a scraping job type Collector struct { // UserAgent is the User-Agent string used by HTTP requests UserAgent string + // Custom headers for the request + Headers *http.Header // MaxDepth limits the recursion depth of visited URLs. // Set it to 0 for infinite recursion (default). MaxDepth int @@ -102,28 +105,43 @@ type Collector struct { // without explicit charset declaration. This feature uses https://github.com/saintfish/chardet DetectCharset bool // RedirectHandler allows control on how a redirect will be managed - RedirectHandler func(req *http.Request, via []*http.Request) error + // use c.SetRedirectHandler to set this value + redirectHandler func(req *http.Request, via []*http.Request) error // CheckHead performs a HEAD request before every GET to pre-validate the response - CheckHead bool - store storage.Storage - debugger debug.Debugger - robotsMap map[string]*robotstxt.RobotsData - htmlCallbacks []*htmlCallbackContainer - xmlCallbacks []*xmlCallbackContainer - requestCallbacks []RequestCallback - responseCallbacks []ResponseCallback - errorCallbacks []ErrorCallback - scrapedCallbacks []ScrapedCallback - requestCount uint32 - responseCount uint32 - backend *httpBackend - wg *sync.WaitGroup - lock *sync.RWMutex + CheckHead bool + // TraceHTTP enables capturing and reporting request performance for crawler tuning. + // When set to true, the Response.Trace will be filled in with an HTTPTrace object. + TraceHTTP bool + // Context is the context that will be used for HTTP requests. You can set this + // to support clean cancellation of scraping. + Context context.Context + // MaxRequests limit the number of requests done by the instance. + // Set it to 0 for infinite requests (default). + MaxRequests uint32 + + store storage.Storage + debugger debug.Debugger + robotsMap map[string]*robotstxt.RobotsData + htmlCallbacks []*htmlCallbackContainer + xmlCallbacks []*xmlCallbackContainer + requestCallbacks []RequestCallback + responseCallbacks []ResponseCallback + responseHeadersCallbacks []ResponseHeadersCallback + errorCallbacks []ErrorCallback + scrapedCallbacks []ScrapedCallback + requestCount uint32 + responseCount uint32 + backend *httpBackend + wg *sync.WaitGroup + lock *sync.RWMutex } // RequestCallback is a type alias for OnRequest callback functions type RequestCallback func(*Request) +// ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions +type ResponseHeadersCallback func(*Response) + // ResponseCallback is a type alias for OnResponse callback functions type ResponseCallback func(*Response) @@ -142,6 +160,26 @@ type ScrapedCallback func(*Response) // ProxyFunc is a type alias for proxy setter functions. 
type ProxyFunc func(*http.Request) (*url.URL, error) +// AlreadyVisitedError is the error type for already visited URLs. +// +// It's returned synchronously by Visit when the URL passed to Visit +// is already visited. +// +// When already visited URL is encountered after following +// redirects, this error appears in OnError callback, and if Async +// mode is not enabled, is also returned by Visit. +type AlreadyVisitedError struct { + // Destination is the URL that was attempted to be visited. + // It might not match the URL passed to Visit if redirect + // was followed. + Destination *url.URL +} + +// Error implements error interface. +func (e *AlreadyVisitedError) Error() string { + return fmt.Sprintf("%q already visited", e.Destination) +} + type htmlCallbackContainer struct { Selector string Function HTMLCallback @@ -181,14 +219,22 @@ var ( // ErrNoURLFiltersMatch is the error thrown if visiting // a URL which is not allowed by URLFilters ErrNoURLFiltersMatch = errors.New("No URLFilters match") - // ErrAlreadyVisited is the error type for already visited URLs - ErrAlreadyVisited = errors.New("URL already visited") // ErrRobotsTxtBlocked is the error type for robots.txt errors ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt") // ErrNoCookieJar is the error type for missing cookie jar ErrNoCookieJar = errors.New("Cookie jar is not available") // ErrNoPattern is the error type for LimitRules without patterns ErrNoPattern = errors.New("No pattern defined in LimitRule") + // ErrEmptyProxyURL is the error type for empty Proxy URL list + ErrEmptyProxyURL = errors.New("Proxy URL list is empty") + // ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer. + ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers") + // ErrQueueFull is the error returned when the queue is full + ErrQueueFull = errors.New("Queue MaxSize reached") + // ErrMaxRequests is the error returned when exceeding max requests + ErrMaxRequests = errors.New("Max Requests limit reached") + // ErrRetryBodyUnseekable is the error when retry with not seekable body + ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable") ) var envMap = map[string]func(*Collector, string){ @@ -212,7 +258,7 @@ var envMap = map[string]func(*Collector, string){ }, "FOLLOW_REDIRECTS": func(c *Collector, val string) { if !isYesString(val) { - c.RedirectHandler = func(req *http.Request, via []*http.Request) error { + c.redirectHandler = func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse } } @@ -225,20 +271,31 @@ var envMap = map[string]func(*Collector, string){ }, "MAX_DEPTH": func(c *Collector, val string) { maxDepth, err := strconv.Atoi(val) - if err != nil { + if err == nil { c.MaxDepth = maxDepth } }, + "MAX_REQUESTS": func(c *Collector, val string) { + maxRequests, err := strconv.ParseUint(val, 0, 32) + if err == nil { + c.MaxRequests = uint32(maxRequests) + } + }, "PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) { c.ParseHTTPErrorResponse = isYesString(val) }, + "TRACE_HTTP": func(c *Collector, val string) { + c.TraceHTTP = isYesString(val) + }, "USER_AGENT": func(c *Collector, val string) { c.UserAgent = val }, } +var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign()) + // NewCollector creates a new Collector instance with default configuration -func NewCollector(options ...func(*Collector)) *Collector { +func NewCollector(options ...CollectorOption) *Collector { c := &Collector{} c.Init() @@ -252,35 
+309,54 @@ func NewCollector(options ...func(*Collector)) *Collector { } // UserAgent sets the user agent used by the Collector. -func UserAgent(ua string) func(*Collector) { +func UserAgent(ua string) CollectorOption { return func(c *Collector) { c.UserAgent = ua } } +// Headers sets the custom headers used by the Collector. +func Headers(headers map[string]string) CollectorOption { + return func(c *Collector) { + customHeaders := make(http.Header) + for header, value := range headers { + customHeaders.Add(header, value) + } + c.Headers = &customHeaders + } +} + // MaxDepth limits the recursion depth of visited URLs. -func MaxDepth(depth int) func(*Collector) { +func MaxDepth(depth int) CollectorOption { return func(c *Collector) { c.MaxDepth = depth } } +// MaxRequests limit the number of requests done by the instance. +// Set it to 0 for infinite requests (default). +func MaxRequests(max uint32) CollectorOption { + return func(c *Collector) { + c.MaxRequests = max + } +} + // AllowedDomains sets the domain whitelist used by the Collector. -func AllowedDomains(domains ...string) func(*Collector) { +func AllowedDomains(domains ...string) CollectorOption { return func(c *Collector) { c.AllowedDomains = domains } } // ParseHTTPErrorResponse allows parsing responses with HTTP errors -func ParseHTTPErrorResponse() func(*Collector) { +func ParseHTTPErrorResponse() CollectorOption { return func(c *Collector) { c.ParseHTTPErrorResponse = true } } // DisallowedDomains sets the domain blacklist used by the Collector. -func DisallowedDomains(domains ...string) func(*Collector) { +func DisallowedDomains(domains ...string) CollectorOption { return func(c *Collector) { c.DisallowedDomains = domains } @@ -288,7 +364,7 @@ func DisallowedDomains(domains ...string) func(*Collector) { // DisallowedURLFilters sets the list of regular expressions which restricts // visiting URLs. If any of the rules matches to a URL the request will be stopped. -func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) { +func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption { return func(c *Collector) { c.DisallowedURLFilters = filters } @@ -296,28 +372,28 @@ func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) { // URLFilters sets the list of regular expressions which restricts // visiting URLs. If any of the rules matches to a URL the request won't be stopped. -func URLFilters(filters ...*regexp.Regexp) func(*Collector) { +func URLFilters(filters ...*regexp.Regexp) CollectorOption { return func(c *Collector) { c.URLFilters = filters } } // AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL -func AllowURLRevisit() func(*Collector) { +func AllowURLRevisit() CollectorOption { return func(c *Collector) { c.AllowURLRevisit = true } } // MaxBodySize sets the limit of the retrieved response body in bytes. -func MaxBodySize(sizeInBytes int) func(*Collector) { +func MaxBodySize(sizeInBytes int) CollectorOption { return func(c *Collector) { c.MaxBodySize = sizeInBytes } } // CacheDir specifies the location where GET requests are cached as files. -func CacheDir(path string) func(*Collector) { +func CacheDir(path string) CollectorOption { return func(c *Collector) { c.CacheDir = path } @@ -325,47 +401,76 @@ func CacheDir(path string) func(*Collector) { // IgnoreRobotsTxt instructs the Collector to ignore any restrictions // set by the target host's robots.txt file. 
-func IgnoreRobotsTxt() func(*Collector) { +func IgnoreRobotsTxt() CollectorOption { return func(c *Collector) { c.IgnoreRobotsTxt = true } } +// TraceHTTP instructs the Collector to collect and report request trace data +// on the Response.Trace. +func TraceHTTP() CollectorOption { + return func(c *Collector) { + c.TraceHTTP = true + } +} + +// StdlibContext sets the context that will be used for HTTP requests. +// You can set this to support clean cancellation of scraping. +func StdlibContext(ctx context.Context) CollectorOption { + return func(c *Collector) { + c.Context = ctx + } +} + // ID sets the unique identifier of the Collector. -func ID(id uint32) func(*Collector) { +func ID(id uint32) CollectorOption { return func(c *Collector) { c.ID = id } } // Async turns on asynchronous network requests. -func Async(a bool) func(*Collector) { +func Async(a ...bool) CollectorOption { return func(c *Collector) { - c.Async = a + if len(a) > 0 { + c.Async = a[0] + } else { + c.Async = true + } } } // DetectCharset enables character encoding detection for non-utf8 response bodies // without explicit charset declaration. This feature uses https://github.com/saintfish/chardet -func DetectCharset() func(*Collector) { +func DetectCharset() CollectorOption { return func(c *Collector) { c.DetectCharset = true } } // Debugger sets the debugger used by the Collector. -func Debugger(d debug.Debugger) func(*Collector) { +func Debugger(d debug.Debugger) CollectorOption { return func(c *Collector) { d.Init() c.debugger = d } } +// CheckHead performs a HEAD request before every GET to pre-validate the response +func CheckHead() CollectorOption { + return func(c *Collector) { + c.CheckHead = true + } +} + // Init initializes the Collector's private variables and sets default // configuration for the Collector func (c *Collector) Init() { - c.UserAgent = "colly - https://github.com/gocolly/colly" + c.UserAgent = "colly - https://github.com/gocolly/colly/v2" + c.Headers = nil c.MaxDepth = 0 + c.MaxRequests = 0 c.store = &storage.InMemoryStorage{} c.store.Init() c.MaxBodySize = 10 * 1024 * 1024 @@ -378,19 +483,22 @@ func (c *Collector) Init() { c.robotsMap = make(map[string]*robotstxt.RobotsData) c.IgnoreRobotsTxt = true c.ID = atomic.AddUint32(&collectorCounter, 1) + c.TraceHTTP = false + c.Context = context.Background() } // Appengine will replace the Collector's backend http.Client // With an Http.Client that is provided by appengine/urlfetch // This function should be used when the scraper is run on // Google App Engine. Example: -// func startScraper(w http.ResponseWriter, r *http.Request) { -// ctx := appengine.NewContext(r) -// c := colly.NewCollector() -// c.Appengine(ctx) -// ... -// c.Visit("https://google.ca") -// } +// +// func startScraper(w http.ResponseWriter, r *http.Request) { +// ctx := appengine.NewContext(r) +// c := colly.NewCollector() +// c.Appengine(ctx) +// ... 
+// c.Visit("https://google.ca") +// } func (c *Collector) Appengine(ctx context.Context) { client := urlfetch.Client(ctx) client.Jar = c.backend.Client.Jar @@ -412,6 +520,17 @@ func (c *Collector) Visit(URL string) error { return c.scrape(URL, "GET", 1, nil, nil, nil, true) } +// HasVisited checks if the provided URL has been visited +func (c *Collector) HasVisited(URL string) (bool, error) { + return c.checkHasVisited(URL, nil) +} + +// HasPosted checks if the provided URL and requestData have been visited +// This method is useful for avoiding a re-visit of the same URL and POST body +func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) { + return c.checkHasVisited(URL, requestData) +} + // Head starts a collector job by creating a HEAD request. func (c *Collector) Head(URL string) error { return c.scrape(URL, "HEAD", 1, nil, nil, nil, false) @@ -481,6 +600,7 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) { return &Request{ Method: req.Method, URL: u, + Depth: req.Depth, Body: bytes.NewReader(req.Body), Ctx: ctx, ID: atomic.AddUint32(&c.requestCount, 1), @@ -490,48 +610,50 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error { - if err := c.requestCheck(u, method, depth, checkRevisit); err != nil { + parsedWhatwgURL, err := urlParser.Parse(u) + if err != nil { return err } - parsedURL, err := url.Parse(u) + parsedURL, err := url.Parse(parsedWhatwgURL.Href(false)) if err != nil { return err } - if parsedURL.Scheme == "" { - parsedURL.Scheme = "http" + if hdr == nil { + hdr = http.Header{} + if c.Headers != nil { + for k, v := range *c.Headers { + for _, value := range v { + hdr.Add(k, value) + } + } + } } - if !c.isDomainAllowed(parsedURL.Host) { - return ErrForbiddenDomain + if _, ok := hdr["User-Agent"]; !ok { + hdr.Set("User-Agent", c.UserAgent) } - if method != "HEAD" && !c.IgnoreRobotsTxt { - if err = c.checkRobots(parsedURL); err != nil { + if seeker, ok := requestData.(io.ReadSeeker); ok { + _, err := seeker.Seek(0, io.SeekStart) + if err != nil { return err } } - if hdr == nil { - hdr = http.Header{"User-Agent": []string{c.UserAgent}} - } - rc, ok := requestData.(io.ReadCloser) - if !ok && requestData != nil { - rc = ioutil.NopCloser(requestData) + + req, err := http.NewRequest(method, parsedURL.String(), requestData) + if err != nil { + return err } + req.Header = hdr // The Go HTTP API ignores "Host" in the headers, preferring the client // to use the Host field on Request.
- host := parsedURL.Host if hostHeader := hdr.Get("Host"); hostHeader != "" { - host = hostHeader - } - req := &http.Request{ - Method: method, - URL: parsedURL, - Proto: "HTTP/1.1", - ProtoMajor: 1, - ProtoMinor: 1, - Header: hdr, - Body: rc, - Host: host, - } - setRequestBody(req, requestData) + req.Host = hostHeader + } + // note: once 1.13 is minimum supported Go version, + // replace this with http.NewRequestWithContext + req = req.WithContext(c.Context) + if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil { + return err + } u = parsedURL.String() c.wg.Add(1) if c.Async { @@ -541,38 +663,6 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c return c.fetch(u, method, depth, requestData, ctx, hdr, req) } -func setRequestBody(req *http.Request, body io.Reader) { - if body != nil { - switch v := body.(type) { - case *bytes.Buffer: - req.ContentLength = int64(v.Len()) - buf := v.Bytes() - req.GetBody = func() (io.ReadCloser, error) { - r := bytes.NewReader(buf) - return ioutil.NopCloser(r), nil - } - case *bytes.Reader: - req.ContentLength = int64(v.Len()) - snapshot := *v - req.GetBody = func() (io.ReadCloser, error) { - r := snapshot - return ioutil.NopCloser(&r), nil - } - case *strings.Reader: - req.ContentLength = int64(v.Len()) - snapshot := *v - req.GetBody = func() (io.ReadCloser, error) { - r := snapshot - return ioutil.NopCloser(&r), nil - } - } - if req.GetBody != nil && req.ContentLength == 0 { - req.Body = http.NoBody - req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil } - } - } -} - func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error { defer c.wg.Done() if ctx == nil { @@ -581,6 +671,7 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct request := &Request{ URL: req.URL, Headers: &req.Header, + Host: req.Host, Ctx: ctx, Depth: depth, Method: method, @@ -589,6 +680,10 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct ID: atomic.AddUint32(&c.requestCount, 1), } + if req.Header.Get("Accept") == "" { + req.Header.Set("Accept", "*/*") + } + c.handleOnRequest(request) if request.abort { @@ -599,25 +694,31 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct req.Header.Add("Content-Type", "application/x-www-form-urlencoded") } - if req.Header.Get("Accept") == "" { - req.Header.Set("Accept", "*/*") + var hTrace *HTTPTrace + if c.TraceHTTP { + hTrace = &HTTPTrace{} + req = hTrace.WithTrace(req) } - origURL := req.URL - response, err := c.backend.Cache(req, c.MaxBodySize, c.CacheDir) + checkHeadersFunc := func(req *http.Request, statusCode int, headers http.Header) bool { + if req.URL != origURL { + request.URL = req.URL + request.Headers = &req.Header + } + c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers}) + return !request.abort + } + response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir) if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok { request.ProxyURL = proxyURL } if err := c.handleOnError(response, err, request, ctx); err != nil { return err } - if req.URL != origURL { - request.URL = req.URL - request.Headers = &req.Header - } atomic.AddUint32(&c.responseCount, 1) response.Ctx = ctx response.Request = request + response.Trace = hTrace err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding) 
if err != nil { @@ -641,39 +742,69 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct return err } -func (c *Collector) requestCheck(u, method string, depth int, checkRevisit bool) error { - if u == "" { - return ErrMissingURL - } +func (c *Collector) requestCheck(parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int, checkRevisit bool) error { + u := parsedURL.String() if c.MaxDepth > 0 && c.MaxDepth < depth { return ErrMaxDepth } - if len(c.DisallowedURLFilters) > 0 { - if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) { - return ErrForbiddenURL - } + if c.MaxRequests > 0 && c.requestCount >= c.MaxRequests { + return ErrMaxRequests } - if len(c.URLFilters) > 0 { - if !isMatchingFilter(c.URLFilters, []byte(u)) { - return ErrNoURLFiltersMatch + if err := c.checkFilters(u, parsedURL.Hostname()); err != nil { + return err + } + if method != "HEAD" && !c.IgnoreRobotsTxt { + if err := c.checkRobots(parsedURL); err != nil { + return err } } - if checkRevisit && !c.AllowURLRevisit && method == "GET" { - h := fnv.New64a() - h.Write([]byte(u)) - uHash := h.Sum64() + if checkRevisit && !c.AllowURLRevisit { + // TODO weird behaviour, it allows CheckHead to work correctly, + // but it should probably better be solved with + // "check-but-not-save" flag or something + if method != "GET" && getBody == nil { + return nil + } + + var body io.ReadCloser + if getBody != nil { + var err error + body, err = getBody() + if err != nil { + return err + } + defer body.Close() + } + uHash := requestHash(u, body) visited, err := c.store.IsVisited(uHash) if err != nil { return err } if visited { - return ErrAlreadyVisited + return &AlreadyVisitedError{parsedURL} } return c.store.Visited(uHash) } return nil } +func (c *Collector) checkFilters(URL, domain string) error { + if len(c.DisallowedURLFilters) > 0 { + if isMatchingFilter(c.DisallowedURLFilters, []byte(URL)) { + return ErrForbiddenURL + } + } + if len(c.URLFilters) > 0 { + if !isMatchingFilter(c.URLFilters, []byte(URL)) { + return ErrNoURLFiltersMatch + } + } + if !c.isDomainAllowed(domain) { + return ErrForbiddenDomain + } + return nil +} + func (c *Collector) isDomainAllowed(domain string) bool { for _, d2 := range c.DisallowedDomains { if d2 == domain { @@ -702,6 +833,8 @@ func (c *Collector) checkRobots(u *url.URL) error { if err != nil { return err } + defer resp.Body.Close() + robot, err = robotstxt.FromResponse(resp) if err != nil { return err @@ -716,7 +849,11 @@ func (c *Collector) checkRobots(u *url.URL) error { return nil } - if !uaGroup.Test(u.EscapedPath()) { + eu := u.EscapedPath() + if u.RawQuery != "" { + eu += "?" + u.Query().Encode() + } + if !uaGroup.Test(eu) { return ErrRobotsTxtBlocked } return nil @@ -727,8 +864,8 @@ func (c *Collector) checkRobots(u *url.URL) error { func (c *Collector) String() string { return fmt.Sprintf( "Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d", - c.requestCount, - c.responseCount, + atomic.LoadUint32(&c.requestCount), + atomic.LoadUint32(&c.responseCount), len(c.requestCallbacks), len(c.htmlCallbacks), len(c.responseCallbacks), @@ -752,6 +889,23 @@ func (c *Collector) OnRequest(f RequestCallback) { c.lock.Unlock() } +// OnResponseHeaders registers a function. Function will be executed on every response +// when headers and status are already received, but body is not yet read. +// +// Like in OnRequest, you can call Request.Abort to abort the transfer. 
This might be +// useful if, for example, you're following all hyperlinks, but want to avoid +// downloading files. +// +// Be aware that using this will prevent HTTP/1.1 connection reuse, as +// the only way to abort a download is to immediately close the connection. +// HTTP/2 doesn't suffer from this problem, as it's possible to close +// specific stream inside the connection. +func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) { + c.lock.Lock() + c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f) + c.lock.Unlock() +} + // OnResponse registers a function. Function will be executed on every response func (c *Collector) OnResponse(f ResponseCallback) { c.lock.Lock() @@ -846,6 +1000,11 @@ func (c *Collector) OnScraped(f ScrapedCallback) { c.lock.Unlock() } +// SetClient will override the previously set http.Client +func (c *Collector) SetClient(client *http.Client) { + c.backend.Client = client +} + // WithTransport allows you to set a custom http.RoundTripper (transport) func (c *Collector) WithTransport(transport http.RoundTripper) { c.backend.Client.Transport = transport @@ -857,7 +1016,7 @@ func (c *Collector) DisableCookies() { } // SetCookieJar overrides the previously set cookie jar -func (c *Collector) SetCookieJar(j *cookiejar.Jar) { +func (c *Collector) SetCookieJar(j http.CookieJar) { c.backend.Client.Jar = j } @@ -904,9 +1063,11 @@ func (c *Collector) SetProxyFunc(p ProxyFunc) { t, ok := c.backend.Client.Transport.(*http.Transport) if c.backend.Client.Transport != nil && ok { t.Proxy = p + t.DisableKeepAlives = true } else { c.backend.Client.Transport = &http.Transport{ - Proxy: p, + Proxy: p, + DisableKeepAlives: true, } } } @@ -943,16 +1104,53 @@ func (c *Collector) handleOnResponse(r *Response) { } } +func (c *Collector) handleOnResponseHeaders(r *Response) { + if c.debugger != nil { + c.debugger.Event(createEvent("responseHeaders", r.Request.ID, c.ID, map[string]string{ + "url": r.Request.URL.String(), + "status": http.StatusText(r.StatusCode), + })) + } + for _, f := range c.responseHeadersCallbacks { + f(r) + } +} + func (c *Collector) handleOnHTML(resp *Response) error { - if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") { + if len(c.htmlCallbacks) == 0 { + return nil + } + + contentType := resp.Headers.Get("Content-Type") + if contentType == "" { + contentType = http.DetectContentType(resp.Body) + } + // implementation of mime.ParseMediaType without parsing the params + // part + mediatype, _, _ := strings.Cut(contentType, ";") + mediatype = strings.TrimSpace(strings.ToLower(mediatype)) + + // TODO we also want to parse application/xml as XHTML if it has + // appropriate doctype + switch mediatype { + case "text/html", "application/xhtml+xml": + default: return nil } + doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body)) if err != nil { return err } if href, found := doc.Find("base[href]").Attr("href"); found { - resp.Request.baseURL, _ = url.Parse(href) + u, err := urlParser.ParseRef(resp.Request.URL.String(), href) + if err == nil { + baseURL, err := url.Parse(u.Href(false)) + if err == nil { + resp.Request.baseURL = baseURL + } + } + } for _, cc := range c.htmlCallbacks { i := 0 @@ -978,7 +1176,8 @@ func (c *Collector) handleOnXML(resp *Response) error { return nil } contentType := strings.ToLower(resp.Headers.Get("Content-Type")) - if !strings.Contains(contentType, "html") && !strings.Contains(contentType, "xml") { + isXMLFile := 
strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml") || strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml.gz") + if !strings.Contains(contentType, "html") && (!strings.Contains(contentType, "xml") && !isXMLFile) { return nil } @@ -990,7 +1189,10 @@ func (c *Collector) handleOnXML(resp *Response) error { if e := htmlquery.FindOne(doc, "//base"); e != nil { for _, a := range e.Attr { if a.Key == "href" { - resp.Request.baseURL, _ = url.Parse(a.Val) + baseURL, err := resp.Request.URL.Parse(a.Val) + if err == nil { + resp.Request.baseURL = baseURL + } break } } @@ -1008,7 +1210,7 @@ func (c *Collector) handleOnXML(resp *Response) error { cc.Function(e) } } - } else if strings.Contains(contentType, "xml") { + } else if strings.Contains(contentType, "xml") || isXMLFile { doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body)) if err != nil { return err @@ -1082,6 +1284,12 @@ func (c *Collector) Limits(rules []*LimitRule) error { return c.backend.Limits(rules) } +// SetRedirectHandler sets the function that will be invoked when the Collector follows a redirect +func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) { + c.redirectHandler = f + c.backend.Client.CheckRedirect = c.checkRedirectFunc() +} + // SetCookies handles the receipt of the cookies in a reply for the given URL func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error { if c.backend.Client.Jar == nil { @@ -1121,16 +1329,20 @@ func (c *Collector) Clone() *Collector { IgnoreRobotsTxt: c.IgnoreRobotsTxt, MaxBodySize: c.MaxBodySize, MaxDepth: c.MaxDepth, + MaxRequests: c.MaxRequests, DisallowedURLFilters: c.DisallowedURLFilters, URLFilters: c.URLFilters, CheckHead: c.CheckHead, ParseHTTPErrorResponse: c.ParseHTTPErrorResponse, UserAgent: c.UserAgent, + Headers: c.Headers, + TraceHTTP: c.TraceHTTP, + Context: c.Context, store: c.store, backend: c.backend, debugger: c.debugger, Async: c.Async, - RedirectHandler: c.RedirectHandler, + redirectHandler: c.redirectHandler, errorCallbacks: make([]ErrorCallback, 0, 8), htmlCallbacks: make([]*htmlCallbackContainer, 0, 8), xmlCallbacks: make([]*xmlCallbackContainer, 0, 8), @@ -1145,12 +1357,41 @@ func (c *Collector) Clone() *Collector { func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error { return func(req *http.Request, via []*http.Request) error { - if !c.isDomainAllowed(req.URL.Host) { - return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host) + if err := c.checkFilters(req.URL.String(), req.URL.Hostname()); err != nil { + return fmt.Errorf("Not following redirect to %q: %w", req.URL, err) + } + + // allow redirects to the original destination + // to support websites redirecting to the same page while setting + // session cookies + samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String()) + + if !c.AllowURLRevisit && !samePageRedirect { + var body io.ReadCloser + if req.GetBody != nil { + var err error + body, err = req.GetBody() + if err != nil { + return err + } + defer body.Close() + } + uHash := requestHash(req.URL.String(), body) + visited, err := c.store.IsVisited(uHash) + if err != nil { + return err + } + if visited { + return &AlreadyVisitedError{req.URL} + } + err = c.store.Visited(uHash) + if err != nil { + return err + } } - if c.RedirectHandler != nil { - return c.RedirectHandler(req, via) + if c.redirectHandler != nil { + return c.redirectHandler(req, via) } // Honor golangs default of maximum of 10
redirects @@ -1160,13 +1401,6 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ lastRequest := via[len(via)-1] - // Copy the headers from last request - for hName, hValues := range lastRequest.Header { - for _, hValue := range hValues { - req.Header.Set(hName, hValue) - } - } - // If domain has changed, remove the Authorization-header if it exists if req.URL.Host != lastRequest.URL.Host { req.Header.Del("Authorization") @@ -1190,6 +1424,11 @@ func (c *Collector) parseSettingsFromEnv() { } } +func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) { + hash := requestHash(URL, createFormReader(requestData)) + return c.store.IsVisited(hash) +} + // SanitizeFileName replaces dangerous characters in a string // so the return value can be used as a safe file name. func SanitizeFileName(fileName string) string { @@ -1228,7 +1467,8 @@ func createMultipartReader(boundary string, data map[string][]byte) io.Reader { buffer.WriteString("\n") } buffer.WriteString(dashBoundary + "--\n\n") - return buffer + return bytes.NewReader(buffer.Bytes()) + } // randomBoundary was borrowed from @@ -1298,3 +1538,22 @@ func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool { } return false } + +func normalizeURL(u string) string { + parsed, err := urlParser.Parse(u) + if err != nil { + return u + } + return parsed.String() +} + +func requestHash(url string, body io.Reader) uint64 { + h := fnv.New64a() + // reparse the url to fix ambiguities such as + // "http://example.com" vs "http://example.com/" + io.WriteString(h, normalizeURL(url)) + if body != nil { + io.Copy(h, body) + } + return h.Sum64() +} diff --git a/colly_test.go b/colly_test.go index d5c88294b..e70d2774e 100644 --- a/colly_test.go +++ b/colly_test.go @@ -15,19 +15,24 @@ package colly import ( + "bufio" "bytes" + "context" + "errors" "fmt" "net/http" "net/http/httptest" + "net/url" "os" "reflect" "regexp" "strings" "testing" + "time" "github.com/PuerkitoBio/goquery" - "github.com/gocolly/colly/debug" + "github.com/gocolly/colly/v2/debug" ) var serverIndexResponse = []byte("hello world\n") @@ -35,9 +40,10 @@ var robotsFile = ` User-agent: * Allow: /allowed Disallow: /disallowed +Disallow: /allowed*q= ` -func newTestServer() *httptest.Server { +func newUnstartedTestServer() *httptest.Server { mux := http.NewServeMux() mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { @@ -46,7 +52,11 @@ func newTestServer() *httptest.Server { }) mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "text/html") + if r.URL.Query().Get("no-content-type") != "" { + w.Header()["Content-Type"] = nil + } else { + w.Header().Set("Content-Type", "text/html") + } w.Write([]byte(` @@ -61,6 +71,17 @@ func newTestServer() *httptest.Server { `)) }) + mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/xml") + w.Write([]byte(` + + Test Page + This is a test page + This is a test paragraph + + `)) + }) + mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) { if r.Method == "POST" { w.Header().Set("Content-Type", "text/html") @@ -84,7 +105,11 @@ func newTestServer() *httptest.Server { }) mux.Handle("/redirect", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - http.Redirect(w, r, "/redirected/", http.StatusSeeOther) + destination := "/redirected/" + if d := r.URL.Query().Get("d"); d != "" { + destination = d + } + http.Redirect(w, r, destination, 
http.StatusSeeOther) })) @@ -121,6 +146,21 @@ func newTestServer() *httptest.Server { w.Write([]byte(r.Header.Get("User-Agent"))) }) + mux.HandleFunc("/host_header", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + w.Write([]byte(r.Host)) + }) + + mux.HandleFunc("/accept_header", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + w.Write([]byte(r.Header.Get("Accept"))) + }) + + mux.HandleFunc("/custom_header", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + w.Write([]byte(r.Header.Get("Test"))) + }) + mux.HandleFunc("/base", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.Write([]byte(` @@ -136,7 +176,99 @@ func newTestServer() *httptest.Server { `)) }) - return httptest.NewServer(mux) + mux.HandleFunc("/base_relative", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + +Test Page + + + +link + + + `)) + }) + + mux.HandleFunc("/tabs_and_newlines", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + +Test Page + + + +link + + + `)) + }) + + mux.HandleFunc("/foobar/xy", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(` + + +Test Page + + +
<p>hello</p>
+ + + `)) + }) + + mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte("100 percent")) + }) + + mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/octet-stream") + ww := bufio.NewWriter(w) + defer ww.Flush() + for { + // have to check error to detect client aborting download + if _, err := ww.Write([]byte{0x41}); err != nil { + return + } + } + }) + + mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() + + i := 0 + + for { + select { + case <-r.Context().Done(): + return + case t := <-ticker.C: + fmt.Fprintf(w, "%s\n", t) + if flusher, ok := w.(http.Flusher); ok { + flusher.Flush() + } + i++ + if i == 10 { + return + } + } + } + }) + + return httptest.NewUnstartedServer(mux) +} + +func newTestServer() *httptest.Server { + srv := newUnstartedTestServer() + srv.Start() + return srv } var newCollectorTests = map[string]func(*testing.T){ @@ -285,6 +417,53 @@ var newCollectorTests = map[string]func(*testing.T){ t.Fatalf("c.debugger = %v, want %v", got, want) } }, + "CheckHead": func(t *testing.T) { + c := NewCollector(CheckHead()) + + if !c.CheckHead { + t.Fatal("c.CheckHead = false, want true") + } + }, + "Async": func(t *testing.T) { + c := NewCollector(Async()) + + if !c.Async { + t.Fatal("c.Async = false, want true") + } + }, +} + +func TestNoAcceptHeader(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + var receivedHeader string + // checks if Accept is enabled by default + func() { + c := NewCollector() + c.OnResponse(func(resp *Response) { + receivedHeader = string(resp.Body) + }) + c.Visit(ts.URL + "/accept_header") + if receivedHeader != "*/*" { + t.Errorf("default Accept header isn't */*. got: %v", receivedHeader) + } + }() + + // checks if Accept can be disabled + func() { + c := NewCollector() + c.OnRequest(func(r *Request) { + r.Headers.Del("Accept") + }) + c.OnResponse(func(resp *Response) { + receivedHeader = string(resp.Body) + }) + c.Visit(ts.URL + "/accept_header") + if receivedHeader != "" { + t.Errorf("failed to pass request with no Accept header. 
got: %v", receivedHeader) + } + }() } func TestNewCollector(t *testing.T) { @@ -349,6 +528,65 @@ func TestCollectorVisit(t *testing.T) { } } +func TestCollectorVisitWithAllowedDomains(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector(AllowedDomains("localhost", "127.0.0.1", "::1")) + err := c.Visit(ts.URL) + if err != nil { + t.Errorf("Failed to visit url %s", ts.URL) + } + + err = c.Visit("http://example.com") + if err != ErrForbiddenDomain { + t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err) + } +} + +func TestCollectorVisitWithDisallowedDomains(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector(DisallowedDomains("localhost", "127.0.0.1", "::1")) + err := c.Visit(ts.URL) + if err != ErrForbiddenDomain { + t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err) + } + + c2 := NewCollector(DisallowedDomains("example.com")) + err = c2.Visit("http://example.com:8080") + if err != ErrForbiddenDomain { + t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err) + } + err = c2.Visit(ts.URL) + if err != nil { + t.Errorf("Failed to visit url %s", ts.URL) + } +} + +func TestCollectorVisitResponseHeaders(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + var onResponseHeadersCalled bool + + c := NewCollector() + c.OnResponseHeaders(func(r *Response) { + onResponseHeadersCalled = true + if r.Headers.Get("Content-Type") == "application/octet-stream" { + r.Request.Abort() + } + }) + c.OnResponse(func(r *Response) { + t.Error("OnResponse was called") + }) + c.Visit(ts.URL + "/large_binary") + if !onResponseHeadersCalled { + t.Error("OnResponseHeaders was not called") + } +} + func TestCollectorOnHTML(t *testing.T) { ts := newTestServer() defer ts.Close() @@ -393,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) { } } +func TestCollectorContentSniffing(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + htmlCallbackCalled := false + + c.OnResponse(func(r *Response) { + if (*r.Headers)["Content-Type"] != nil { + t.Error("Content-Type unexpectedly not nil") + } + }) + + c.OnHTML("html", func(e *HTMLElement) { + htmlCallbackCalled = true + }) + + err := c.Visit(ts.URL + "/html?no-content-type=yes") + if err != nil { + t.Fatal(err) + } + + if !htmlCallbackCalled { + t.Error("OnHTML was not called") + } +} + func TestCollectorURLRevisit(t *testing.T) { ts := newTestServer() defer ts.Close() @@ -422,160 +688,560 @@ func TestCollectorURLRevisit(t *testing.T) { } } -func TestCollectorPost(t *testing.T) { +func TestCollectorPostRevisit(t *testing.T) { ts := newTestServer() defer ts.Close() postValue := "hello" - c := NewCollector() + postData := map[string]string{ + "name": postValue, + } + visitCount := 0 + c := NewCollector() c.OnResponse(func(r *Response) { if postValue != string(r.Body) { t.Error("Failed to send data with POST") } + visitCount++ }) + c.Post(ts.URL+"/login", postData) + c.Post(ts.URL+"/login", postData) c.Post(ts.URL+"/login", map[string]string{ - "name": postValue, + "name": postValue, + "lastname": "world", }) -} - -func TestRedirect(t *testing.T) { - ts := newTestServer() - defer ts.Close() - c := NewCollector() - c.OnHTML("a[href]", func(e *HTMLElement) { - u := e.Request.AbsoluteURL(e.Attr("href")) - if !strings.HasSuffix(u, "/redirected/test") { - t.Error("Invalid URL after redirect: " + u) - } - }) - c.OnResponse(func(r *Response) { - if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") { - t.Error("Invalid URL in Request 
after redirect: " + r.Request.URL.String()) - } - }) - c.Visit(ts.URL + "/redirect") -} + if visitCount != 2 { + t.Error("URL POST revisited") + } -func TestBaseTag(t *testing.T) { - ts := newTestServer() - defer ts.Close() + c.AllowURLRevisit = true - c := NewCollector() - c.OnHTML("a[href]", func(e *HTMLElement) { - u := e.Request.AbsoluteURL(e.Attr("href")) - if u != "http://xy.com/z" { - t.Error("Invalid tag handling in OnHTML: expected https://xy.com/z, got " + u) - } - }) - c.Visit(ts.URL + "/base") + c.Post(ts.URL+"/login", postData) + c.Post(ts.URL+"/login", postData) - c2 := NewCollector() - c2.OnXML("//a", func(e *XMLElement) { - u := e.Request.AbsoluteURL(e.Attr("href")) - if u != "http://xy.com/z" { - t.Error("Invalid tag handling in OnXML: expected https://xy.com/z, got " + u) - } - }) - c2.Visit(ts.URL + "/base") + if visitCount != 4 { + t.Error("URL POST not revisited") + } } -func TestCollectorCookies(t *testing.T) { +func TestCollectorURLRevisitCheck(t *testing.T) { ts := newTestServer() defer ts.Close() c := NewCollector() - if err := c.Visit(ts.URL + "/set_cookie"); err != nil { - t.Fatal(err) - } + visited, err := c.HasVisited(ts.URL) - if err := c.Visit(ts.URL + "/check_cookie"); err != nil { - t.Fatalf("Failed to use previously set cookies: %s", err) + if err != nil { + t.Error(err.Error()) } -} - -func TestRobotsWhenAllowed(t *testing.T) { - ts := newTestServer() - defer ts.Close() - c := NewCollector() - c.IgnoreRobotsTxt = false + if visited != false { + t.Error("Expected URL to NOT have been visited") + } - c.OnResponse(func(resp *Response) { - if resp.StatusCode != 200 { - t.Fatalf("Wrong response code: %d", resp.StatusCode) - } - }) + c.Visit(ts.URL) - err := c.Visit(ts.URL + "/allowed") + visited, err = c.HasVisited(ts.URL) if err != nil { - t.Fatal(err) + t.Error(err.Error()) } -} -func TestRobotsWhenDisallowed(t *testing.T) { - ts := newTestServer() - defer ts.Close() + if visited != true { + t.Error("Expected URL to have been visited") + } - c := NewCollector() - c.IgnoreRobotsTxt = false + errorTestCases := []struct { + Path string + DestinationError string + }{ + {"/", "/"}, + {"/redirect?d=/", "/"}, + // now that /redirect?d=/ itself is recorded as visited, + // it's now returned in error + {"/redirect?d=/", "/redirect?d=/"}, + {"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/"}, + {"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/redirect%3Fd%3D/"}, + {"/redirect?d=/redirect%3Fd%3D/&foo=bar", "/redirect?d=/"}, + } - c.OnResponse(func(resp *Response) { - t.Fatalf("Received response: %d", resp.StatusCode) - }) + for i, testCase := range errorTestCases { + err := c.Visit(ts.URL + testCase.Path) + if testCase.DestinationError == "" { + if err != nil { + t.Errorf("got unexpected error in test %d: %q", i, err) + } + } else { + var ave *AlreadyVisitedError + if !errors.As(err, &ave) { + t.Errorf("err=%q returned when trying to revisit, expected AlreadyVisitedError", err) + } else { + if got, want := ave.Destination.String(), ts.URL+testCase.DestinationError; got != want { + t.Errorf("wrong destination in AlreadyVisitedError in test %d, got=%q want=%q", i, got, want) + } + } + } + } +} - err := c.Visit(ts.URL + "/disallowed") - if err.Error() != "URL blocked by robots.txt" { - t.Fatalf("wrong error message: %v", err) +func TestSetCookieRedirect(t *testing.T) { + type middleware = func(http.Handler) http.Handler + for _, m := range []middleware{ + requireSessionCookieSimple, + requireSessionCookieAuthPage, + } { + t.Run("", func(t *testing.T) { + ts := 
newUnstartedTestServer() + ts.Config.Handler = m(ts.Config.Handler) + ts.Start() + defer ts.Close() + c := NewCollector() + c.OnResponse(func(r *Response) { + if got, want := r.Body, serverIndexResponse; !bytes.Equal(got, want) { + t.Errorf("bad response body got=%q want=%q", got, want) + } + if got, want := r.StatusCode, http.StatusOK; got != want { + t.Errorf("bad response code got=%d want=%d", got, want) + } + }) + if err := c.Visit(ts.URL); err != nil { + t.Fatal(err) + } + }) } } -func TestIgnoreRobotsWhenDisallowed(t *testing.T) { +func TestCollectorPostURLRevisitCheck(t *testing.T) { ts := newTestServer() defer ts.Close() c := NewCollector() - c.IgnoreRobotsTxt = true - c.OnResponse(func(resp *Response) { - if resp.StatusCode != 200 { - t.Fatalf("Wrong response code: %d", resp.StatusCode) - } - }) + postValue := "hello" + postData := map[string]string{ + "name": postValue, + } - err := c.Visit(ts.URL + "/disallowed") + posted, err := c.HasPosted(ts.URL+"/login", postData) if err != nil { - t.Fatal(err) + t.Error(err.Error()) } -} + if posted != false { + t.Error("Expected URL to NOT have been visited") + } -func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) { - ts := newTestServer() - ts.Close() // immediately close the server to force a connection error + c.Post(ts.URL+"/login", postData) - c := NewCollector() - c.IgnoreRobotsTxt = false - err := c.Visit(ts.URL) + posted, err = c.HasPosted(ts.URL+"/login", postData) - if err == nil { - t.Fatal("Error expected") + if err != nil { + t.Error(err.Error()) } -} -func TestEnvSettings(t *testing.T) { - ts := newTestServer() - defer ts.Close() + if posted != true { + t.Error("Expected URL to have been visited") + } - os.Setenv("COLLY_USER_AGENT", "test") - defer os.Unsetenv("COLLY_USER_AGENT") + postData["lastname"] = "world" + posted, err = c.HasPosted(ts.URL+"/login", postData) - c := NewCollector() + if err != nil { + t.Error(err.Error()) + } + + if posted != false { + t.Error("Expected URL to NOT have been visited") + } + + c.Post(ts.URL+"/login", postData) + + posted, err = c.HasPosted(ts.URL+"/login", postData) + + if err != nil { + t.Error(err.Error()) + } + + if posted != true { + t.Error("Expected URL to have been visited") + } +} + +// TestCollectorURLRevisitDisallowed ensures that disallowed URL is not considered visited. 
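The revisit guarantees exercised above all flow through two public helpers, `Collector.HasVisited` and `Collector.HasPosted`, which consult the storage backend using the same hash of normalized URL (plus form body, for POSTs) that `Visit` and `Post` record. A minimal sketch of using them to skip work that has already been done; `example.com` is only a placeholder target:

```go
package main

import (
	"errors"
	"fmt"
	"log"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()

	// Placeholder URL; substitute a real target.
	target := "http://example.com/"

	// HasVisited asks the storage backend whether this URL's request
	// hash has already been recorded.
	seen, err := c.HasVisited(target)
	if err != nil {
		log.Fatal(err)
	}
	if !seen {
		c.Visit(target)
	}

	// Revisiting the same URL surfaces *colly.AlreadyVisitedError,
	// which carries the destination that was already recorded.
	if err := c.Visit(target); err != nil {
		var visited *colly.AlreadyVisitedError
		if errors.As(err, &visited) {
			fmt.Println("already visited:", visited.Destination)
		}
	}

	// POST requests are hashed together with their form data, so the
	// same URL with a different payload counts as a fresh request.
	form := map[string]string{"name": "hello"}
	posted, err := c.HasPosted(target, form)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("already posted:", posted)
}
```

Because the form data is part of the request hash, posting a different payload to the same URL is tracked as a distinct request, which is exactly what the POST revisit tests above assert.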
+func TestCollectorURLRevisitDomainDisallowed(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + parsedURL, err := url.Parse(ts.URL) + if err != nil { + t.Fatal(err) + } + + c := NewCollector(DisallowedDomains(parsedURL.Hostname())) + err = c.Visit(ts.URL) + if got, want := err, ErrForbiddenDomain; got != want { + t.Fatalf("wrong error on first visit: got=%v want=%v", got, want) + } + err = c.Visit(ts.URL) + if got, want := err, ErrForbiddenDomain; got != want { + t.Fatalf("wrong error on second visit: got=%v want=%v", got, want) + } + +} + +func TestCollectorPost(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + postValue := "hello" + c := NewCollector() + + c.OnResponse(func(r *Response) { + if postValue != string(r.Body) { + t.Error("Failed to send data with POST") + } + }) + + c.Post(ts.URL+"/login", map[string]string{ + "name": postValue, + }) +} + +func TestCollectorPostRaw(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + postValue := "hello" + c := NewCollector() + + c.OnResponse(func(r *Response) { + if postValue != string(r.Body) { + t.Error("Failed to send data with POST") + } + }) + + c.PostRaw(ts.URL+"/login", []byte("name="+postValue)) +} + +func TestCollectorPostRawRevisit(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + postValue := "hello" + postData := "name=" + postValue + visitCount := 0 + + c := NewCollector() + c.OnResponse(func(r *Response) { + if postValue != string(r.Body) { + t.Error("Failed to send data with POST RAW") + } + visitCount++ + }) + + c.PostRaw(ts.URL+"/login", []byte(postData)) + c.PostRaw(ts.URL+"/login", []byte(postData)) + c.PostRaw(ts.URL+"/login", []byte(postData+"&lastname=world")) + + if visitCount != 2 { + t.Error("URL POST RAW revisited") + } + + c.AllowURLRevisit = true + + c.PostRaw(ts.URL+"/login", []byte(postData)) + c.PostRaw(ts.URL+"/login", []byte(postData)) + + if visitCount != 4 { + t.Error("URL POST RAW not revisited") + } +} + +func TestRedirect(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.OnHTML("a[href]", func(e *HTMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + if !strings.HasSuffix(u, "/redirected/test") { + t.Error("Invalid URL after redirect: " + u) + } + }) + + c.OnResponseHeaders(func(r *Response) { + if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") { + t.Error("Invalid URL in Request after redirect (OnResponseHeaders): " + r.Request.URL.String()) + } + }) + + c.OnResponse(func(r *Response) { + if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") { + t.Error("Invalid URL in Request after redirect (OnResponse): " + r.Request.URL.String()) + } + }) + c.Visit(ts.URL + "/redirect") +} + +func TestIssue594(t *testing.T) { + // This is a regression test for a data race bug. 
There's no + // assertions because it's meant to be used with race detector + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + // if timeout is set, this bug is not triggered + c.SetClient(&http.Client{Timeout: 0 * time.Second}) + + c.Visit(ts.URL) +} + +func TestRedirectWithDisallowedURLs(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.DisallowedURLFilters = []*regexp.Regexp{regexp.MustCompile(ts.URL + "/redirected/test")} + c.OnHTML("a[href]", func(e *HTMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + err := c.Visit(u) + if !errors.Is(err, ErrForbiddenURL) { + t.Error("URL should have been forbidden: " + u) + } + }) + + c.Visit(ts.URL + "/redirect") +} + +func TestBaseTag(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.OnHTML("a[href]", func(e *HTMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + if u != "http://xy.com/z" { + t.Error("Invalid tag handling in OnHTML: expected https://xy.com/z, got " + u) + } + }) + c.Visit(ts.URL + "/base") + + c2 := NewCollector() + c2.OnXML("//a", func(e *XMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + if u != "http://xy.com/z" { + t.Error("Invalid tag handling in OnXML: expected https://xy.com/z, got " + u) + } + }) + c2.Visit(ts.URL + "/base") +} + +func TestBaseTagRelative(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.OnHTML("a[href]", func(e *HTMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + expected := ts.URL + "/foobar/z" + if u != expected { + t.Errorf("Invalid tag handling in OnHTML: expected %q, got %q", expected, u) + } + }) + c.Visit(ts.URL + "/base_relative") + + c2 := NewCollector() + c2.OnXML("//a", func(e *XMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + expected := ts.URL + "/foobar/z" + if u != expected { + t.Errorf("Invalid tag handling in OnXML: expected %q, got %q", expected, u) + } + }) + c2.Visit(ts.URL + "/base_relative") +} + +func TestTabsAndNewlines(t *testing.T) { + // this test might look odd, but see step 3 of + // https://url.spec.whatwg.org/#concept-basic-url-parser + + ts := newTestServer() + defer ts.Close() + + visited := map[string]struct{}{} + expected := map[string]struct{}{ + "/tabs_and_newlines": {}, + "/foobar/xy": {}, + } + + c := NewCollector() + c.OnResponse(func(res *Response) { + visited[res.Request.URL.EscapedPath()] = struct{}{} + }) + c.OnHTML("a[href]", func(e *HTMLElement) { + if err := e.Request.Visit(e.Attr("href")); err != nil { + t.Errorf("visit failed: %v", err) + } + }) + + if err := c.Visit(ts.URL + "/tabs_and_newlines"); err != nil { + t.Errorf("visit failed: %v", err) + } + + if !reflect.DeepEqual(visited, expected) { + t.Errorf("visited=%v expected=%v", visited, expected) + } +} + +func TestLonePercent(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + var visitedPath string + + c := NewCollector() + c.OnResponse(func(res *Response) { + visitedPath = res.Request.URL.RequestURI() + }) + if err := c.Visit(ts.URL + "/100%"); err != nil { + t.Errorf("visit failed: %v", err) + } + // Automatic encoding is not really correct: browsers + // would send bare percent here. However, Go net/http + // cannot send such requests due to + // https://github.com/golang/go/issues/29808. So we have two + // alternatives really: return an error when attempting + // to fetch such URLs, or at least try the encoded variant. + // This test checks that the latter is attempted. 
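Both halves of that comment are plain `net/url` behaviour; a short standard-library sketch, with `example.com` standing in for the test server:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A lone "%" is not a valid escape, so net/url refuses to parse the
	// path at all; a request could never be sent in this form.
	if _, err := url.Parse("http://example.com/100%"); err != nil {
		fmt.Println(err) // reports an "invalid URL escape" error
	}

	// The encoded variant parses fine, and the escaped form is what
	// goes on the wire.
	u, _ := url.Parse("http://example.com/100%25")
	fmt.Println(u.RequestURI()) // /100%25

	// An invalid escape in the query component is tolerated; the query
	// is carried through untouched.
	u, _ = url.Parse("http://example.com/?a=100%zz")
	fmt.Println(u.RequestURI()) // /?a=100%zz
}
```

This is why the collector can only attempt the encoded variant of a lone-percent path, while a bad escape in the query needs no special handling.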
+ if got, want := visitedPath, "/100%25"; got != want { + t.Errorf("got=%q want=%q", got, want) + } + // invalid URL escape in query component is not a problem, + // but check it anyway + if err := c.Visit(ts.URL + "/?a=100%zz"); err != nil { + t.Errorf("visit failed: %v", err) + } + if got, want := visitedPath, "/?a=100%zz"; got != want { + t.Errorf("got=%q want=%q", got, want) + } +} + +func TestCollectorCookies(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + if err := c.Visit(ts.URL + "/set_cookie"); err != nil { + t.Fatal(err) + } + + if err := c.Visit(ts.URL + "/check_cookie"); err != nil { + t.Fatalf("Failed to use previously set cookies: %s", err) + } +} + +func TestRobotsWhenAllowed(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.IgnoreRobotsTxt = false + + c.OnResponse(func(resp *Response) { + if resp.StatusCode != 200 { + t.Fatalf("Wrong response code: %d", resp.StatusCode) + } + }) + + err := c.Visit(ts.URL + "/allowed") + + if err != nil { + t.Fatal(err) + } +} + +func TestRobotsWhenDisallowed(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.IgnoreRobotsTxt = false + + c.OnResponse(func(resp *Response) { + t.Fatalf("Received response: %d", resp.StatusCode) + }) + + err := c.Visit(ts.URL + "/disallowed") + if err.Error() != "URL blocked by robots.txt" { + t.Fatalf("wrong error message: %v", err) + } +} + +func TestRobotsWhenDisallowedWithQueryParameter(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.IgnoreRobotsTxt = false + + c.OnResponse(func(resp *Response) { + t.Fatalf("Received response: %d", resp.StatusCode) + }) + + err := c.Visit(ts.URL + "/allowed?q=1") + if err.Error() != "URL blocked by robots.txt" { + t.Fatalf("wrong error message: %v", err) + } +} + +func TestIgnoreRobotsWhenDisallowed(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.IgnoreRobotsTxt = true + + c.OnResponse(func(resp *Response) { + if resp.StatusCode != 200 { + t.Fatalf("Wrong response code: %d", resp.StatusCode) + } + }) + + err := c.Visit(ts.URL + "/disallowed") + + if err != nil { + t.Fatal(err) + } + +} + +func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) { + ts := newTestServer() + ts.Close() // immediately close the server to force a connection error + + c := NewCollector() + c.IgnoreRobotsTxt = false + err := c.Visit(ts.URL) + + if err == nil { + t.Fatal("Error expected") + } +} + +func TestEnvSettings(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + os.Setenv("COLLY_USER_AGENT", "test") + defer os.Unsetenv("COLLY_USER_AGENT") + + c := NewCollector() valid := false @@ -592,6 +1258,121 @@ func TestEnvSettings(t *testing.T) { } } +func TestUserAgent(t *testing.T) { + const exampleUserAgent1 = "Example/1.0" + const exampleUserAgent2 = "Example/2.0" + const defaultUserAgent = "colly - https://github.com/gocolly/colly/v2" + + ts := newTestServer() + defer ts.Close() + + var receivedUserAgent string + + func() { + c := NewCollector() + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + c.Visit(ts.URL + "/user_agent") + if got, want := receivedUserAgent, defaultUserAgent; got != want { + t.Errorf("mismatched User-Agent: got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector(UserAgent(exampleUserAgent1)) + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + c.Visit(ts.URL + "/user_agent") + if got, want := 
receivedUserAgent, exampleUserAgent1; got != want { + t.Errorf("mismatched User-Agent: got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector(UserAgent(exampleUserAgent1)) + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + + c.Request("GET", ts.URL+"/user_agent", nil, nil, nil) + if got, want := receivedUserAgent, exampleUserAgent1; got != want { + t.Errorf("mismatched User-Agent (nil hdr): got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector(UserAgent(exampleUserAgent1)) + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + + c.Request("GET", ts.URL+"/user_agent", nil, nil, http.Header{}) + if got, want := receivedUserAgent, exampleUserAgent1; got != want { + t.Errorf("mismatched User-Agent (non-nil hdr): got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector(UserAgent(exampleUserAgent1)) + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + hdr := http.Header{} + hdr.Set("User-Agent", "") + + c.Request("GET", ts.URL+"/user_agent", nil, nil, hdr) + if got, want := receivedUserAgent, ""; got != want { + t.Errorf("mismatched User-Agent (hdr with empty UA): got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector(UserAgent(exampleUserAgent1)) + c.OnResponse(func(resp *Response) { + receivedUserAgent = string(resp.Body) + }) + hdr := http.Header{} + hdr.Set("User-Agent", exampleUserAgent2) + + c.Request("GET", ts.URL+"/user_agent", nil, nil, hdr) + if got, want := receivedUserAgent, exampleUserAgent2; got != want { + t.Errorf("mismatched User-Agent (hdr with UA): got=%q want=%q", got, want) + } + }() +} + +func TestHeaders(t *testing.T) { + const exampleHostHeader = "example.com" + const exampleTestHeader = "Testing" + + ts := newTestServer() + defer ts.Close() + + var receivedHeader string + + func() { + c := NewCollector( + Headers(map[string]string{"Host": exampleHostHeader}), + ) + c.OnResponse(func(resp *Response) { + receivedHeader = string(resp.Body) + }) + c.Visit(ts.URL + "/host_header") + if got, want := receivedHeader, exampleHostHeader; got != want { + t.Errorf("mismatched Host header: got=%q want=%q", got, want) + } + }() + func() { + c := NewCollector( + Headers(map[string]string{"Test": exampleTestHeader}), + ) + c.OnResponse(func(resp *Response) { + receivedHeader = string(resp.Body) + }) + c.Visit(ts.URL + "/custom_header") + if got, want := receivedHeader, exampleTestHeader; got != want { + t.Errorf("mismatched custom header: got=%q want=%q", got, want) + } + }() +} + func TestParseHTTPErrorResponse(t *testing.T) { contentCount := 0 ts := newTestServer() @@ -662,7 +1443,7 @@ func TestHTMLElement(t *testing.T) { } } -func TestCollectorOnXML(t *testing.T) { +func TestCollectorOnXMLWithHtml(t *testing.T) { ts := newTestServer() defer ts.Close() @@ -706,6 +1487,186 @@ func TestCollectorOnXML(t *testing.T) { } } +func TestCollectorOnXMLWithXML(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + + titleCallbackCalled := false + paragraphCallbackCount := 0 + + c.OnXML("//page/title", func(e *XMLElement) { + titleCallbackCalled = true + if e.Text != "Test Page" { + t.Error("Title element text does not match, got", e.Text) + } + }) + + c.OnXML("//page/paragraph", func(e *XMLElement) { + paragraphCallbackCount++ + if e.Attr("type") != "description" { + t.Error("Failed to get paragraph's type attribute") + } + }) + + c.OnXML("/page", func(e *XMLElement) { + if e.ChildAttr("paragraph", "type") != 
"description" { + t.Error("Invalid type value") + } + classes := e.ChildAttrs("paragraph", "type") + if len(classes) != 2 { + t.Error("Invalid type values") + } + }) + + c.Visit(ts.URL + "/xml") + + if !titleCallbackCalled { + t.Error("Failed to call OnXML callback for tag") + } + + if paragraphCallbackCount != 2 { + t.Error("Failed to find all <paragraph> tags") + } +} + +func TestCollectorVisitWithTrace(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector(AllowedDomains("localhost", "127.0.0.1", "::1"), TraceHTTP()) + c.OnResponse(func(resp *Response) { + if resp.Trace == nil { + t.Error("Failed to initialize trace") + } + }) + + err := c.Visit(ts.URL) + if err != nil { + t.Errorf("Failed to visit url %s", ts.URL) + } +} + +func TestCollectorVisitWithCheckHead(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector(CheckHead()) + var requestMethodChain []string + c.OnResponse(func(resp *Response) { + requestMethodChain = append(requestMethodChain, resp.Request.Method) + }) + + err := c.Visit(ts.URL) + if err != nil { + t.Errorf("Failed to visit url %s", ts.URL) + } + if requestMethodChain[0] != "HEAD" && requestMethodChain[1] != "GET" { + t.Errorf("Failed to perform a HEAD request before GET") + } +} + +func TestCollectorDepth(t *testing.T) { + ts := newTestServer() + defer ts.Close() + maxDepth := 2 + c1 := NewCollector( + MaxDepth(maxDepth), + AllowURLRevisit(), + ) + requestCount := 0 + c1.OnResponse(func(resp *Response) { + requestCount++ + if requestCount >= 10 { + return + } + c1.Visit(ts.URL) + }) + c1.Visit(ts.URL) + if requestCount < 10 { + t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth", requestCount) + } + + c2 := c1.Clone() + requestCount = 0 + c2.OnResponse(func(resp *Response) { + requestCount++ + resp.Request.Visit(ts.URL) + }) + c2.Visit(ts.URL) + if requestCount != 2 { + t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2", requestCount) + } + + c1.Visit(ts.URL) + if requestCount < 10 { + t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth again", requestCount) + } + + requestCount = 0 + c2.Visit(ts.URL) + if requestCount != 2 { + t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2 again", requestCount) + } +} + +func TestCollectorRequests(t *testing.T) { + ts := newTestServer() + defer ts.Close() + maxRequests := uint32(5) + c1 := NewCollector( + MaxRequests(maxRequests), + AllowURLRevisit(), + ) + requestCount := 0 + c1.OnResponse(func(resp *Response) { + requestCount++ + c1.Visit(ts.URL) + }) + c1.Visit(ts.URL) + if requestCount != 5 { + t.Errorf("Invalid number of requests: %d (expected 5) with MaxRequests", requestCount) + } +} + +func TestCollectorContext(t *testing.T) { + // "/slow" takes 1 second to return the response. + // If context does abort the transfer after 0.5 seconds as it should, + // OnError will be called, and the test is passed. Otherwise, test is failed. 
+ + ts := newTestServer() + defer ts.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + + c := NewCollector(StdlibContext(ctx)) + + onErrorCalled := false + + c.OnResponse(func(resp *Response) { + t.Error("OnResponse was called, expected OnError") + }) + + c.OnError(func(resp *Response, err error) { + onErrorCalled = true + if err != context.DeadlineExceeded { + t.Errorf("OnError got err=%#v, expected context.DeadlineExceeded", err) + } + }) + + err := c.Visit(ts.URL + "/slow") + if err != context.DeadlineExceeded { + t.Errorf("Visit return err=%#v, expected context.DeadlineExceeded", err) + } + + if !onErrorCalled { + t.Error("OnError was not called") + } + +} + func BenchmarkOnHTML(b *testing.B) { ts := newTestServer() defer ts.Close() @@ -742,3 +1703,114 @@ func BenchmarkOnResponse(b *testing.B) { c.Visit(ts.URL) } } + +func requireSessionCookieSimple(handler http.Handler) http.Handler { + const cookieName = "session_id" + + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if _, err := r.Cookie(cookieName); err == http.ErrNoCookie { + http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"}) + http.Redirect(w, r, r.RequestURI, http.StatusFound) + return + } + handler.ServeHTTP(w, r) + }) +} + +func requireSessionCookieAuthPage(handler http.Handler) http.Handler { + const setCookiePath = "/auth" + const cookieName = "session_id" + + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == setCookiePath { + destination := r.URL.Query().Get("return") + http.Redirect(w, r, destination, http.StatusFound) + return + } + if _, err := r.Cookie(cookieName); err == http.ErrNoCookie { + http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"}) + http.Redirect(w, r, setCookiePath+"?return="+url.QueryEscape(r.RequestURI), http.StatusFound) + return + } + handler.ServeHTTP(w, r) + }) +} + +func TestCollectorPostRetry(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + postValue := "hello" + c := NewCollector() + try := false + c.OnResponse(func(r *Response) { + if r.Ctx.Get("notFirst") == "" { + r.Ctx.Put("notFirst", "first") + _ = r.Request.Retry() + return + } + if postValue != string(r.Body) { + t.Error("Failed to send data with POST") + } + try = true + }) + + c.Post(ts.URL+"/login", map[string]string{ + "name": postValue, + }) + if !try { + t.Error("OnResponse Retry was not called") + } +} +func TestCollectorGetRetry(t *testing.T) { + ts := newTestServer() + defer ts.Close() + try := false + + c := NewCollector() + + c.OnResponse(func(r *Response) { + if r.Ctx.Get("notFirst") == "" { + r.Ctx.Put("notFirst", "first") + _ = r.Request.Retry() + return + } + if !bytes.Equal(r.Body, serverIndexResponse) { + t.Error("Response body does not match with the original content") + } + try = true + }) + + c.Visit(ts.URL) + if !try { + t.Error("OnResponse Retry was not called") + } +} + +func TestCollectorPostRetryUnseekable(t *testing.T) { + ts := newTestServer() + defer ts.Close() + try := false + postValue := "hello" + c := NewCollector() + + c.OnResponse(func(r *Response) { + if postValue != string(r.Body) { + t.Error("Failed to send data with POST") + } + + if r.Ctx.Get("notFirst") == "" { + r.Ctx.Put("notFirst", "first") + err := r.Request.Retry() + if !errors.Is(err, ErrRetryBodyUnseekable) { + t.Errorf("Unexpected error Type ErrRetryBodyUnseekable : %v", err) + } + return + } + try = true + }) + c.Request("POST", ts.URL+"/login", 
bytes.NewBuffer([]byte("name="+postValue)), nil, nil) + if try { + t.Error("OnResponse Retry was called but BodyUnseekable") + } +} diff --git a/debug/webdebugger.go b/debug/webdebugger.go index e246361e1..504a9eb04 100644 --- a/debug/webdebugger.go +++ b/debug/webdebugger.go @@ -18,6 +18,7 @@ import ( "encoding/json" "log" "net/http" + "sync" "time" ) @@ -28,6 +29,7 @@ type WebDebugger struct { initialized bool CurrentRequests map[uint32]requestInfo RequestLog []requestInfo + sync.Mutex } type requestInfo struct { @@ -61,6 +63,9 @@ func (w *WebDebugger) Init() error { // Event updates the debugger's status func (w *WebDebugger) Event(e *Event) { + w.Lock() + defer w.Unlock() + switch e.Type { case "request": w.CurrentRequests[e.RequestID] = requestInfo{ @@ -119,11 +124,11 @@ function fetchStatus() { $("#request_log_count").text('(' + data.RequestLog.length + ')'); for(var i in data.CurrentRequests) { var r = data.CurrentRequests[i]; - $("#current_requests").append(curRequestTpl(r.Url, r.Started, r.CollectorId)); + $("#current_requests").append(curRequestTpl(r.URL, r.Started, r.CollectorID)); } for(var i in data.RequestLog.reverse()) { var r = data.RequestLog[i]; - $("#request_log").append(requestLogTpl(r.Url, r.Duration, r.CollectorId)); + $("#request_log").append(requestLogTpl(r.URL, r.Duration, r.CollectorID)); } setTimeout(fetchStatus, 1000); }); @@ -138,7 +143,9 @@ $(document).ready(function() { } func (w *WebDebugger) statusHandler(wr http.ResponseWriter, r *http.Request) { + w.Lock() jsonData, err := json.MarshalIndent(w, "", " ") + w.Unlock() if err != nil { panic(err) } diff --git a/extensions/random_user_agent.go b/extensions/random_user_agent.go index 6426a14ff..296b2f6e6 100644 --- a/extensions/random_user_agent.go +++ b/extensions/random_user_agent.go @@ -3,57 +3,537 @@ package extensions import ( "fmt" "math/rand" + "strings" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) var uaGens = []func() string{ genFirefoxUA, genChromeUA, + genEdgeUA, + genOperaUA, } -// RandomUserAgent generates a random browser user agent on every request +var uaGensMobile = []func() string{ + genMobilePixel7UA, + genMobilePixel6UA, + genMobilePixel5UA, + genMobilePixel4UA, + genMobileNexus10UA, +} + +// RandomUserAgent generates a random DESKTOP browser user-agent on every requests func RandomUserAgent(c *colly.Collector) { c.OnRequest(func(r *colly.Request) { r.Headers.Set("User-Agent", uaGens[rand.Intn(len(uaGens))]()) }) } +// RandomMobileUserAgent generates a random MOBILE browser user-agent on every requests +func RandomMobileUserAgent(c *colly.Collector) { + c.OnRequest(func(r *colly.Request) { + r.Headers.Set("User-Agent", uaGensMobile[rand.Intn(len(uaGensMobile))]()) + }) +} + var ffVersions = []float32{ - 58.0, - 57.0, - 56.0, - 52.0, - 48.0, - 40.0, - 35.0, + // NOTE: Only version released after Jun 1, 2022 will be listed. + // Data source: https://en.wikipedia.org/wiki/Firefox_version_history + + // 2022 + 102.0, + 103.0, + 104.0, + 105.0, + 106.0, + 107.0, + 108.0, + + // 2023 + 109.0, + 110.0, + 111.0, + 112.0, + 113.0, } var chromeVersions = []string{ - "65.0.3325.146", - "64.0.3282.0", - "41.0.2228.0", - "40.0.2214.93", - "37.0.2062.124", + // NOTE: Only version released after Jun 1, 2022 will be listed. 
+ // Data source: https://chromereleases.googleblog.com/search/label/Stable%20updates + + // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop.html + "102.0.5005.115", + + // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_21.html + "103.0.5060.53", + + // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_27.html + "103.0.5060.66", + + // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop.html + "103.0.5060.114", + + // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop_19.html + "103.0.5060.134", + + // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop.html + "104.0.5112.79", + "104.0.5112.80", + "104.0.5112.81", + + // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_16.html + "104.0.5112.101", + "104.0.5112.102", + + // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_30.html + "105.0.5195.52", + "105.0.5195.53", + "105.0.5195.54", + + // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop.html + "105.0.5195.102", + + // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_14.html + "105.0.5195.125", + "105.0.5195.126", + "105.0.5195.127", + + // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_27.html + "106.0.5249.61", + "106.0.5249.62", + + // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_30.html + "106.0.5249.91", + + // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop.html + "106.0.5249.103", + + // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_11.html + "106.0.5249.119", + + // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_25.html + "107.0.5304.62", + "107.0.5304.63", + "107.0.5304.68", + + // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_27.html + "107.0.5304.87", + "107.0.5304.88", + + // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop.html + "107.0.5304.106", + "107.0.5304.107", + "107.0.5304.110", + + // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_24.html + "107.0.5304.121", + "107.0.5304.122", + + // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_29.html + "108.0.5359.71", + "108.0.5359.72", + + // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop.html + "108.0.5359.94", + "108.0.5359.95", + + // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_7.html + "108.0.5359.98", + "108.0.5359.99", + + // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_13.html + "108.0.5359.124", + "108.0.5359.125", + + // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop.html + "109.0.5414.74", + "109.0.5414.75", + "109.0.5414.87", + + // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop_24.html + "109.0.5414.119", + "109.0.5414.120", + + // https://chromereleases.googleblog.com/2023/02/stable-channel-update-for-desktop.html + "110.0.5481.77", + "110.0.5481.78", + + // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update.html + "110.0.5481.96", + "110.0.5481.97", + + // 
https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_14.html + "110.0.5481.100", + + // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_16.html + "110.0.5481.104", + + // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_22.html + "110.0.5481.177", + "110.0.5481.178", + + // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_97.html + "109.0.5414.129", + + // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop.html + "111.0.5563.64", + "111.0.5563.65", + + // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_21.html + "111.0.5563.110", + "111.0.5563.111", + + // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_27.html + "111.0.5563.146", + "111.0.5563.147", + + // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop.html + "112.0.5615.49", + "112.0.5615.50", + + // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_12.html + "112.0.5615.86", + "112.0.5615.87", + + // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_14.html + "112.0.5615.121", + + // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_18.html + "112.0.5615.137", + "112.0.5615.138", + "112.0.5615.165", + + // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop.html + "113.0.5672.63", + "113.0.5672.64", + + // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop_8.html + "113.0.5672.92", + "113.0.5672.93", +} + +var edgeVersions = []string{ + // NOTE: Only version released after Jun 1, 2022 will be listed. + // Data source: https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule + + // 2022 + "103.0.0.0,103.0.1264.37", + "104.0.0.0,104.0.1293.47", + "105.0.0.0,105.0.1343.25", + "106.0.0.0,106.0.1370.34", + "107.0.0.0,107.0.1418.24", + "108.0.0.0,108.0.1462.42", + + // 2023 + "109.0.0.0,109.0.1518.49", + "110.0.0.0,110.0.1587.41", + "111.0.0.0,111.0.1661.41", + "112.0.0.0,112.0.1722.34", + "113.0.0.0,113.0.1774.3", +} + +var operaVersions = []string{ + // NOTE: Only version released after Jan 1, 2023 will be listed. 
+ // Data source: https://blogs.opera.com/desktop/ + + // https://blogs.opera.com/desktop/changelog-for-96/ + "110.0.5449.0,96.0.4640.0", + "110.0.5464.2,96.0.4653.0", + "110.0.5464.2,96.0.4660.0", + "110.0.5481.30,96.0.4674.0", + "110.0.5481.30,96.0.4691.0", + "110.0.5481.30,96.0.4693.12", + "110.0.5481.77,96.0.4693.16", + "110.0.5481.100,96.0.4693.20", + "110.0.5481.178,96.0.4693.31", + "110.0.5481.178,96.0.4693.50", + "110.0.5481.192,96.0.4693.80", + + // https://blogs.opera.com/desktop/changelog-for-97/ + "111.0.5532.2,97.0.4711.0", + "111.0.5532.2,97.0.4704.0", + "111.0.5532.2,97.0.4697.0", + "111.0.5562.0,97.0.4718.0", + "111.0.5563.19,97.0.4719.4", + "111.0.5563.19,97.0.4719.11", + "111.0.5563.41,97.0.4719.17", + "111.0.5563.65,97.0.4719.26", + "111.0.5563.65,97.0.4719.28", + "111.0.5563.111,97.0.4719.43", + "111.0.5563.147,97.0.4719.63", + "111.0.5563.147,97.0.4719.83", + + // https://blogs.opera.com/desktop/changelog-for-98/ + "112.0.5596.2,98.0.4756.0", + "112.0.5596.2,98.0.4746.0", + "112.0.5615.20,98.0.4759.1", + "112.0.5615.50,98.0.4759.3", + "112.0.5615.87,98.0.4759.6", + "112.0.5615.165,98.0.4759.15", + "112.0.5615.165,98.0.4759.21", + "112.0.5615.165,98.0.4759.39", +} + +var pixel7AndroidVersions = []string{ + // Data source: + // - https://developer.android.com/about/versions + // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + "13", +} + +var pixel6AndroidVersions = []string{ + // Data source: + // - https://developer.android.com/about/versions + // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + "12", + "13", +} + +var pixel5AndroidVersions = []string{ + // Data source: + // - https://developer.android.com/about/versions + // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + "11", + "12", + "13", +} + +var pixel4AndroidVersions = []string{ + // Data source: + // - https://developer.android.com/about/versions + // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + "10", + "11", + "12", + "13", +} + +var nexus10AndroidVersions = []string{ + // Data source: + // - https://developer.android.com/about/versions + // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + "4.4.2", + "4.4.4", + "5.0", + "5.0.1", + "5.0.2", + "5.1", + "5.1.1", +} + +var nexus10Builds = []string{ + // Data source: https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds + + "LMY49M", // android-5.1.1_r38 (Lollipop) + "LMY49J", // android-5.1.1_r37 (Lollipop) + "LMY49I", // android-5.1.1_r36 (Lollipop) + "LMY49H", // android-5.1.1_r35 (Lollipop) + "LMY49G", // android-5.1.1_r34 (Lollipop) + "LMY49F", // android-5.1.1_r33 (Lollipop) + "LMY48Z", // android-5.1.1_r30 (Lollipop) + "LMY48X", // android-5.1.1_r25 (Lollipop) + "LMY48T", // android-5.1.1_r19 (Lollipop) + "LMY48M", // android-5.1.1_r14 (Lollipop) + "LMY48I", // android-5.1.1_r9 (Lollipop) + "LMY47V", // android-5.1.1_r1 (Lollipop) + "LMY47D", // android-5.1.0_r1 (Lollipop) + "LRX22G", // android-5.0.2_r1 (Lollipop) + "LRX22C", // android-5.0.1_r1 (Lollipop) + "LRX21P", // android-5.0.0_r4.0.1 (Lollipop) + "KTU84P", // android-4.4.4_r1 (KitKat) + "KTU84L", // android-4.4.3_r1 (KitKat) + "KOT49H", // android-4.4.2_r1 (KitKat) + "KOT49E", // android-4.4.1_r1 (KitKat) + "KRT16S", // android-4.4_r1.2 (KitKat) + "JWR66Y", // android-4.3_r1.1 (Jelly Bean) + "JWR66V", // android-4.3_r1 (Jelly Bean) + "JWR66N", // 
android-4.3_r0.9.1 (Jelly Bean) + "JDQ39 ", // android-4.2.2_r1 (Jelly Bean) + "JOP40F", // android-4.2.1_r1.1 (Jelly Bean) + "JOP40D", // android-4.2.1_r1 (Jelly Bean) + "JOP40C", // android-4.2_r1 (Jelly Bean) } var osStrings = []string{ - "Macintosh; Intel Mac OS X 10_10", - "Windows NT 10.0", + // MacOS - High Sierra + "Macintosh; Intel Mac OS X 10_13", + "Macintosh; Intel Mac OS X 10_13_1", + "Macintosh; Intel Mac OS X 10_13_2", + "Macintosh; Intel Mac OS X 10_13_3", + "Macintosh; Intel Mac OS X 10_13_4", + "Macintosh; Intel Mac OS X 10_13_5", + "Macintosh; Intel Mac OS X 10_13_6", + + // MacOS - Mojave + "Macintosh; Intel Mac OS X 10_14", + "Macintosh; Intel Mac OS X 10_14_1", + "Macintosh; Intel Mac OS X 10_14_2", + "Macintosh; Intel Mac OS X 10_14_3", + "Macintosh; Intel Mac OS X 10_14_4", + "Macintosh; Intel Mac OS X 10_14_5", + "Macintosh; Intel Mac OS X 10_14_6", + + // MacOS - Catalina + "Macintosh; Intel Mac OS X 10_15", + "Macintosh; Intel Mac OS X 10_15_1", + "Macintosh; Intel Mac OS X 10_15_2", + "Macintosh; Intel Mac OS X 10_15_3", + "Macintosh; Intel Mac OS X 10_15_4", + "Macintosh; Intel Mac OS X 10_15_5", + "Macintosh; Intel Mac OS X 10_15_6", + "Macintosh; Intel Mac OS X 10_15_7", + + // MacOS - Big Sur + "Macintosh; Intel Mac OS X 11_0", + "Macintosh; Intel Mac OS X 11_0_1", + "Macintosh; Intel Mac OS X 11_1", + "Macintosh; Intel Mac OS X 11_2", + "Macintosh; Intel Mac OS X 11_2_1", + "Macintosh; Intel Mac OS X 11_2_2", + "Macintosh; Intel Mac OS X 11_2_3", + "Macintosh; Intel Mac OS X 11_3", + "Macintosh; Intel Mac OS X 11_3_1", + "Macintosh; Intel Mac OS X 11_4", + "Macintosh; Intel Mac OS X 11_5", + "Macintosh; Intel Mac OS X 11_5_1", + "Macintosh; Intel Mac OS X 11_5_2", + "Macintosh; Intel Mac OS X 11_6", + "Macintosh; Intel Mac OS X 11_6_1", + "Macintosh; Intel Mac OS X 11_6_2", + "Macintosh; Intel Mac OS X 11_6_3", + "Macintosh; Intel Mac OS X 11_6_4", + "Macintosh; Intel Mac OS X 11_6_5", + "Macintosh; Intel Mac OS X 11_6_6", + "Macintosh; Intel Mac OS X 11_6_7", + "Macintosh; Intel Mac OS X 11_6_8", + "Macintosh; Intel Mac OS X 11_7", + "Macintosh; Intel Mac OS X 11_7_1", + "Macintosh; Intel Mac OS X 11_7_2", + "Macintosh; Intel Mac OS X 11_7_3", + "Macintosh; Intel Mac OS X 11_7_4", + "Macintosh; Intel Mac OS X 11_7_5", + "Macintosh; Intel Mac OS X 11_7_6", + + // MacOS - Monterey + "Macintosh; Intel Mac OS X 12_0", + "Macintosh; Intel Mac OS X 12_0_1", + "Macintosh; Intel Mac OS X 12_1", + "Macintosh; Intel Mac OS X 12_2", + "Macintosh; Intel Mac OS X 12_2_1", + "Macintosh; Intel Mac OS X 12_3", + "Macintosh; Intel Mac OS X 12_3_1", + "Macintosh; Intel Mac OS X 12_4", + "Macintosh; Intel Mac OS X 12_5", + "Macintosh; Intel Mac OS X 12_5_1", + "Macintosh; Intel Mac OS X 12_6", + "Macintosh; Intel Mac OS X 12_6_1", + "Macintosh; Intel Mac OS X 12_6_2", + "Macintosh; Intel Mac OS X 12_6_3", + "Macintosh; Intel Mac OS X 12_6_4", + "Macintosh; Intel Mac OS X 12_6_5", + + // MacOS - Ventura + "Macintosh; Intel Mac OS X 13_0", + "Macintosh; Intel Mac OS X 13_0_1", + "Macintosh; Intel Mac OS X 13_1", + "Macintosh; Intel Mac OS X 13_2", + "Macintosh; Intel Mac OS X 13_2_1", + "Macintosh; Intel Mac OS X 13_3", + "Macintosh; Intel Mac OS X 13_3_1", + + // Windows + "Windows NT 10.0; Win64; x64", "Windows NT 5.1", "Windows NT 6.1; WOW64", "Windows NT 6.1; Win64; x64", + + // Linux "X11; Linux x86_64", } +// Generates Firefox Browser User-Agent (Desktop) +// +// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0" func genFirefoxUA() 
string { version := ffVersions[rand.Intn(len(ffVersions))] os := osStrings[rand.Intn(len(osStrings))] return fmt.Sprintf("Mozilla/5.0 (%s; rv:%.1f) Gecko/20100101 Firefox/%.1f", os, version, version) } +// Generates Chrome Browser User-Agent (Desktop) +// +// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36" func genChromeUA() string { version := chromeVersions[rand.Intn(len(chromeVersions))] os := osStrings[rand.Intn(len(osStrings))] return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", os, version) } + +// Generates Microsoft Edge User-Agent (Desktop) +// +// -> "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39" +func genEdgeUA() string { + version := edgeVersions[rand.Intn(len(edgeVersions))] + chromeVersion := strings.Split(version, ",")[0] + edgeVersion := strings.Split(version, ",")[1] + os := osStrings[rand.Intn(len(osStrings))] + return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 Edg/%s", os, chromeVersion, edgeVersion) +} + +// Generates Opera Browser User-Agent (Desktop) +// +// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.4759.3" +func genOperaUA() string { + version := operaVersions[rand.Intn(len(operaVersions))] + chromeVersion := strings.Split(version, ",")[0] + operaVersion := strings.Split(version, ",")[1] + os := osStrings[rand.Intn(len(osStrings))] + return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 OPR/%s", os, chromeVersion, operaVersion) +} + +// Generates Pixel 7 Browser User-Agent (Mobile) +// +// -> Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36 +func genMobilePixel7UA() string { + android := pixel7AndroidVersions[rand.Intn(len(pixel7AndroidVersions))] + chrome := chromeVersions[rand.Intn(len(chromeVersions))] + return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) +} + +// Generates Pixel 6 Browser User-Agent (Mobile) +// +// -> "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36" +func genMobilePixel6UA() string { + android := pixel6AndroidVersions[rand.Intn(len(pixel6AndroidVersions))] + chrome := chromeVersions[rand.Intn(len(chromeVersions))] + return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) +} + +// Generates Pixel 5 Browser User-Agent (Mobile) +// +// -> "Mozilla/5.0 (Linux; Android 13; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36" +func genMobilePixel5UA() string { + android := pixel5AndroidVersions[rand.Intn(len(pixel5AndroidVersions))] + chrome := chromeVersions[rand.Intn(len(chromeVersions))] + return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) +} + +// Generates Pixel 4 Browser User-Agent (Mobile) +// +// -> "Mozilla/5.0 (Linux; Android 13; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36" +func genMobilePixel4UA() string { + android := 
pixel4AndroidVersions[rand.Intn(len(pixel4AndroidVersions))] + chrome := chromeVersions[rand.Intn(len(chromeVersions))] + return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome) +} + +// Generates Nexus 10 Browser User-Agent (Mobile) +// +// -> "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 10 Build/LMY48T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Safari/537.36" +func genMobileNexus10UA() string { + build := nexus10Builds[rand.Intn(len(nexus10Builds))] + android := nexus10AndroidVersions[rand.Intn(len(nexus10AndroidVersions))] + chrome := chromeVersions[rand.Intn(len(chromeVersions))] + return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Nexus 10 Build/%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, build, chrome) +} diff --git a/extensions/referer.go b/extensions/referer.go index 6b13a32a3..32a1c69ea 100644 --- a/extensions/referer.go +++ b/extensions/referer.go @@ -1,7 +1,7 @@ package extensions import ( - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // Referer sets valid Referer HTTP header to requests. diff --git a/extensions/url_length_filter.go b/extensions/url_length_filter.go index 695b74e57..141cfb57d 100644 --- a/extensions/url_length_filter.go +++ b/extensions/url_length_filter.go @@ -1,7 +1,7 @@ package extensions import ( - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" ) // URLLengthFilter filters out requests with URLs longer than URLLengthLimit diff --git a/go.mod b/go.mod new file mode 100644 index 000000000..8cdce202e --- /dev/null +++ b/go.mod @@ -0,0 +1,19 @@ +module github.com/gocolly/colly/v2 + +go 1.12 + +require ( + github.com/PuerkitoBio/goquery v1.5.1 + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.2.3 + github.com/antchfx/xmlquery v1.3.4 + github.com/gobwas/glob v0.2.3 + github.com/jawher/mow.cli v1.1.0 + github.com/kennygrant/sanitize v1.2.4 + github.com/nlnwa/whatwg-url v0.1.2 + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca + github.com/temoto/robotstxt v1.1.1 + golang.org/x/net v0.17.0 + google.golang.org/appengine v1.6.6 + google.golang.org/protobuf v1.33.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 000000000..861506403 --- /dev/null +++ b/go.sum @@ -0,0 +1,104 @@ +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= +github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= +github.com/antchfx/xmlquery v1.3.4 h1:RuhsI4AA5Ma4XoXhaAr2VjJxU0Xp0W2zy/f9ZIpsF4s= +github.com/antchfx/xmlquery v1.3.4/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc= +github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg= +github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/bits-and-blooms/bitset v1.2.2-0.20220111210104-dfa3e347c392 
+github.com/bits-and-blooms/bitset v1.2.2-0.20220111210104-dfa3e347c392/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
+github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
+github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
+github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/jawher/mow.cli v1.1.0 h1:NdtHXRc0CwZQ507wMvQ/IS+Q3W3x2fycn973/b8Zuk8=
+github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
+github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
+github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/nlnwa/whatwg-url v0.1.2 h1:BqqsIVG6xv71wOoMAoFDmV6OK6/2sXn7BJdOsTkBl88=
+github.com/nlnwa/whatwg-url v0.1.2/go.mod h1:b0r+dEyM/KztLMDSVY6ApcO9Fmzgq+e9+Ugq20UBYck=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220114011407-0dd24b26b47d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
+google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
+google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
+google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
+google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
diff --git a/htmlelement.go b/htmlelement.go
index 92484bd2b..7128949e5 100644
--- a/htmlelement.go
+++ b/htmlelement.go
@@ -68,6 +68,17 @@ func (h *HTMLElement) ChildText(goquerySelector string) string {
 	return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
 }
 
+// ChildTexts returns the stripped text content of all the matching
+// elements.
+func (h *HTMLElement) ChildTexts(goquerySelector string) []string {
+	var res []string
+	h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
+		res = append(res, strings.TrimSpace(s.Text()))
+	})
+	return res
+}
+
 // ChildAttr returns the stripped text content of the first matching
 // element's attribute.
 func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
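ChildTexts pairs naturally with OnHTML. A short usage sketch — the "ul.items" selector and example.com URL are invented; ChildTexts is exactly what the hunk above adds:

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnHTML("ul.items", func(e *colly.HTMLElement) {
		// ChildText flattens every match into one string;
		// ChildTexts keeps each match as its own slice element.
		for _, txt := range e.ChildTexts("li") {
			fmt.Println(txt)
		}
	})
	c.Visit("https://example.com/")
}
```
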
diff --git a/http_backend.go b/http_backend.go
index 5c3c216d2..e580f7a2e 100644
--- a/http_backend.go
+++ b/http_backend.go
@@ -19,12 +19,12 @@ import (
 	"encoding/gob"
 	"encoding/hex"
 	"io"
-	"io/ioutil"
 	"math/rand"
 	"net/http"
 	"os"
 	"path"
 	"regexp"
+	"strings"
 	"sync"
 	"time"
@@ -39,16 +39,18 @@ type httpBackend struct {
 	lock *sync.RWMutex
 }
 
+type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool
+
 // LimitRule provides connection restrictions for domains.
 // Both DomainRegexp and DomainGlob can be used to specify
 // the included domain patterns, but at least one is required.
 // There can be two kinds of limitations:
-// - Parallelism: Set limit for the number of concurrent requests to matching domains
-// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
+//   - Parallelism: Set limit for the number of concurrent requests to matching domains
+//   - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
 type LimitRule struct {
 	// DomainRegexp is a regular expression to match against domains
 	DomainRegexp string
-	// DomainRegexp is a glob pattern to match against domains
+	// DomainGlob is a glob pattern to match against domains
 	DomainGlob string
 	// Delay is the duration to wait before creating a new request to the matching domains
 	Delay time.Duration
@@ -126,9 +128,9 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule {
 	return nil
 }
 
-func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string) (*Response, error) {
-	if cacheDir == "" || request.Method != "GET" {
-		return h.Do(request, bodySize)
+func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) {
+	if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" {
+		return h.Do(request, bodySize, checkHeadersFunc)
 	}
 	sum := sha1.Sum([]byte(request.URL.String()))
 	hash := hex.EncodeToString(sum[:])
@@ -138,11 +140,12 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string
 		resp := new(Response)
 		err := gob.NewDecoder(file).Decode(resp)
 		file.Close()
+		checkHeadersFunc(request, resp.StatusCode, *resp.Headers)
 		if resp.StatusCode < 500 {
 			return resp, err
 		}
 	}
-	resp, err := h.Do(request, bodySize)
+	resp, err := h.Do(request, bodySize, checkHeadersFunc)
 	if err != nil || resp.StatusCode >= 500 {
 		return resp, err
 	}
@@ -163,7 +166,7 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string
 	return resp, os.Rename(filename+"~", filename)
 }
 
-func (h *httpBackend) Do(request *http.Request, bodySize int) (*Response, error) {
+func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) {
 	r := h.GetMatchingRule(request.URL.Host)
 	if r != nil {
 		r.waitChan <- true
@@ -181,22 +184,31 @@ func (h *httpBackend) Do(request *http.Request, bodySize int) (*Response, error)
 	if err != nil {
 		return nil, err
 	}
+	defer res.Body.Close()
+
+	finalRequest := request
 	if res.Request != nil {
-		*request = *res.Request
+		finalRequest = res.Request
+	}
+	if !checkHeadersFunc(finalRequest, res.StatusCode, res.Header) {
+		// closing res.Body (see defer above) without reading it aborts
+		// the download
+		return nil, ErrAbortedAfterHeaders
 	}
 	var bodyReader io.Reader = res.Body
 	if bodySize > 0 {
 		bodyReader = io.LimitReader(bodyReader, int64(bodySize))
 	}
-	if !res.Uncompressed && res.Header.Get("Content-Encoding") == "gzip" {
+	contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
+	if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
 		bodyReader, err = gzip.NewReader(bodyReader)
 		if err != nil {
 			return nil, err
 		}
+		defer bodyReader.(*gzip.Reader).Close()
 	}
-	body, err := ioutil.ReadAll(bodyReader)
-	defer res.Body.Close()
+	body, err := io.ReadAll(bodyReader)
 	if err != nil {
 		return nil, err
 	}
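The new Cache-Control check in Cache() means a caller can bypass the on-disk cache per request. A hedged sketch of what that looks like from user code — the cache directory path and URL are invented; CacheDir and OnRequest are existing colly APIs:

```go
package main

import (
	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector(
		colly.CacheDir("./colly-cache"), // hypothetical cache location
	)
	c.OnRequest(func(r *colly.Request) {
		// With the Cache() change above, this header skips the stored
		// copy and forces a live fetch.
		r.Headers.Set("Cache-Control", "no-cache")
	})
	c.Visit("https://example.com/")
}
```
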
diff --git a/http_trace.go b/http_trace.go
new file mode 100644
index 000000000..bcacbe313
--- /dev/null
+++ b/http_trace.go
@@ -0,0 +1,37 @@
+package colly
+
+import (
+	"net/http"
+	"net/http/httptrace"
+	"time"
+)
+
+// HTTPTrace provides a data structure for storing an HTTP trace.
+type HTTPTrace struct {
+	start, connect    time.Time
+	ConnectDuration   time.Duration
+	FirstByteDuration time.Duration
+}
+
+// trace returns a httptrace.ClientTrace object to be used with an HTTP
+// request via httptrace.WithClientTrace() that fills in the HTTPTrace.
+func (ht *HTTPTrace) trace() *httptrace.ClientTrace {
+	trace := &httptrace.ClientTrace{
+		ConnectStart: func(network, addr string) { ht.connect = time.Now() },
+		ConnectDone: func(network, addr string, err error) {
+			ht.ConnectDuration = time.Since(ht.connect)
+		},
+
+		GetConn: func(hostPort string) { ht.start = time.Now() },
+		GotFirstResponseByte: func() {
+			ht.FirstByteDuration = time.Since(ht.start)
+		},
+	}
+	return trace
+}
+
+// WithTrace returns the given HTTP Request with this HTTPTrace added to its
+// context.
+func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request {
+	return req.WithContext(httptrace.WithClientTrace(req.Context(), ht.trace()))
+}
diff --git a/http_trace_test.go b/http_trace_test.go
new file mode 100644
index 000000000..6f4d88d9d
--- /dev/null
+++ b/http_trace_test.go
@@ -0,0 +1,73 @@
+package colly
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+const testDelay = 200 * time.Millisecond
+
+func newTraceTestServer(delay time.Duration) *httptest.Server {
+	mux := http.NewServeMux()
+
+	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(delay)
+		w.WriteHeader(200)
+	})
+	mux.HandleFunc("/error", func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(delay)
+		w.WriteHeader(500)
+	})
+
+	return httptest.NewServer(mux)
+}
+
+func TestTraceWithNoDelay(t *testing.T) {
+	ts := newTraceTestServer(0)
+	defer ts.Close()
+
+	client := ts.Client()
+	req, err := http.NewRequest("GET", ts.URL, nil)
+	if err != nil {
+		t.Errorf("Failed to construct request: %v", err)
+	}
+	trace := &HTTPTrace{}
+	req = trace.WithTrace(req)
+
+	if _, err = client.Do(req); err != nil {
+		t.Errorf("Failed to make request: %v", err)
+	}
+
+	if trace.ConnectDuration > testDelay {
+		t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
+	}
+	if trace.FirstByteDuration > testDelay {
+		t.Errorf("trace FirstByteDuration should be (almost) 0, got %v", trace.FirstByteDuration)
+	}
+}
+
+func TestTraceWithDelay(t *testing.T) {
+	ts := newTraceTestServer(testDelay)
+	defer ts.Close()
+
+	client := ts.Client()
+	req, err := http.NewRequest("GET", ts.URL, nil)
+	if err != nil {
+		t.Errorf("Failed to construct request: %v", err)
+	}
+	trace := &HTTPTrace{}
+	req = trace.WithTrace(req)
+
+	if _, err = client.Do(req); err != nil {
+		t.Errorf("Failed to make request: %v", err)
+	}
+
+	if trace.ConnectDuration > testDelay {
+		t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
+	}
+	if trace.FirstByteDuration < testDelay {
+		t.Errorf("trace FirstByteDuration should be at least 200ms, got %v", trace.FirstByteDuration)
+	}
+}
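Outside of tests, the trace is consumed through the collector rather than by calling WithTrace directly. A sketch assuming the Collector exposes a TraceHTTP switch, as the Response.Trace comment later in this diff describes — the log output and URL are illustrative:

```go
package main

import (
	"log"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.TraceHTTP = true // per response.go: Trace is only set when this is true

	c.OnResponse(func(r *colly.Response) {
		if r.Trace != nil {
			log.Printf("connect: %v, first byte: %v",
				r.Trace.ConnectDuration, r.Trace.FirstByteDuration)
		}
	})
	c.Visit("https://example.com/")
}
```
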
diff --git a/proxy/proxy.go b/proxy/proxy.go
index 18bcb2ad3..a4bd84852 100644
--- a/proxy/proxy.go
+++ b/proxy/proxy.go
@@ -20,7 +20,7 @@ import (
 	"net/url"
 	"sync/atomic"
 
-	"github.com/gocolly/colly"
+	"github.com/gocolly/colly/v2"
 )
 
 type roundRobinSwitcher struct {
@@ -29,8 +29,9 @@ type roundRobinSwitcher struct {
 }
 
 func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) {
-	u := r.proxyURLs[r.index%uint32(len(r.proxyURLs))]
-	atomic.AddUint32(&r.index, 1)
+	index := atomic.AddUint32(&r.index, 1) - 1
+	u := r.proxyURLs[index%uint32(len(r.proxyURLs))]
+
 	ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, u.String())
 	*pr = *pr.WithContext(ctx)
 	return u, nil
@@ -42,6 +43,9 @@ func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) {
 // and "socks5" are supported. If the scheme is empty,
 // "http" is assumed.
 func RoundRobinProxySwitcher(ProxyURLs ...string) (colly.ProxyFunc, error) {
+	if len(ProxyURLs) < 1 {
+		return nil, colly.ErrEmptyProxyURL
+	}
 	urls := make([]*url.URL, len(ProxyURLs))
 	for i, u := range ProxyURLs {
 		parsedU, err := url.Parse(u)
diff --git a/queue/queue.go b/queue/queue.go
index f7a133d3b..0d0d78a66 100644
--- a/queue/queue.go
+++ b/queue/queue.go
@@ -3,14 +3,18 @@ package queue
 import (
 	"net/url"
 	"sync"
-	"sync/atomic"
 
-	"github.com/gocolly/colly"
+	whatwgUrl "github.com/nlnwa/whatwg-url/url"
+
+	"github.com/gocolly/colly/v2"
 )
 
 const stop = true
 
+var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
+
 // Storage is the interface of the queue's storage backend
+// Storage must be safe for concurrent use by multiple goroutines.
 type Storage interface {
 	// Init initializes the storage
 	Init() error
@@ -27,11 +31,11 @@ type Storage interface {
 // requests in multiple threads
 type Queue struct {
 	// Threads defines the number of consumer threads
-	Threads           int
-	storage           Storage
-	activeThreadCount int32
-	threadChans       []chan bool
-	lock              *sync.Mutex
+	Threads int
+	storage Storage
+	wake    chan struct{}
+	mut     sync.Mutex // guards wake and running
+	running bool
 }
 
 // InMemoryQueueStorage is the default implementation of the Storage interface.
@@ -61,10 +65,9 @@ func New(threads int, s Storage) (*Queue, error) {
 		return nil, err
 	}
 	return &Queue{
-		Threads:     threads,
-		storage:     s,
-		lock:        &sync.Mutex{},
-		threadChans: make([]chan bool, 0, threads),
+		Threads: threads,
+		storage: s,
+		running: true,
 	}, nil
 }
 
@@ -76,12 +79,16 @@ func (q *Queue) IsEmpty() bool {
 
 // AddURL adds a new URL to the queue
 func (q *Queue) AddURL(URL string) error {
-	u, err := url.Parse(URL)
+	u, err := urlParser.Parse(URL)
+	if err != nil {
+		return err
+	}
+	u2, err := url.Parse(u.Href(false))
 	if err != nil {
 		return err
 	}
 	r := &colly.Request{
-		URL:    u,
+		URL:    u2,
 		Method: "GET",
 	}
 	d, err := r.Marshal()
@@ -93,20 +100,26 @@ func (q *Queue) AddURL(URL string) error {
 
 // AddRequest adds a new Request to the queue
 func (q *Queue) AddRequest(r *colly.Request) error {
-	d, err := r.Marshal()
+	q.mut.Lock()
+	waken := q.wake != nil
+	q.mut.Unlock()
+	if !waken {
+		return q.storeRequest(r)
+	}
+	err := q.storeRequest(r)
 	if err != nil {
 		return err
 	}
-	if err := q.storage.AddRequest(d); err != nil {
+	q.wake <- struct{}{}
+	return nil
+}
+
+func (q *Queue) storeRequest(r *colly.Request) error {
+	d, err := r.Marshal()
+	if err != nil {
 		return err
 	}
-	q.lock.Lock()
-	for _, c := range q.threadChans {
-		c <- !stop
-	}
-	q.threadChans = make([]chan bool, 0, q.Threads)
-	q.lock.Unlock()
-	return nil
+	return q.storage.AddRequest(d)
 }
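From the caller's side, the reworked queue is still driven the same way. A minimal sketch assuming a reachable start URL — queue.New, InMemoryQueueStorage, AddURL, and Run are all defined in this file:

```go
package main

import (
	"log"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

func main() {
	// Two consumer threads over the default in-memory storage.
	q, err := queue.New(2, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		log.Fatal(err)
	}
	q.AddURL("https://example.com/") // stored until Run starts consuming
	// Run blocks until the queue drains (or Stop is called).
	if err := q.Run(colly.NewCollector()); err != nil {
		log.Fatal(err)
	}
}
```
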
 
 // Size returns the size of the queue
@@ -116,56 +129,96 @@
 // Run starts consumer threads and calls the Collector
 // to perform requests. Run blocks while the queue has active requests.
+// The given Storage must not be used directly while Run blocks.
 func (q *Queue) Run(c *colly.Collector) error {
-	wg := &sync.WaitGroup{}
+	q.mut.Lock()
+	if q.wake != nil && q.running {
+		q.mut.Unlock()
+		panic("cannot call duplicate Queue.Run")
+	}
+	q.wake = make(chan struct{})
+	q.running = true
+	q.mut.Unlock()
+
+	requestc := make(chan *colly.Request)
+	complete, errc := make(chan struct{}), make(chan error, 1)
 	for i := 0; i < q.Threads; i++ {
-		wg.Add(1)
-		go func(c *colly.Collector, wg *sync.WaitGroup) {
-			defer wg.Done()
-			for {
-				if q.IsEmpty() {
-					if q.activeThreadCount == 0 {
-						break
-					}
-					ch := make(chan bool)
-					q.lock.Lock()
-					q.threadChans = append(q.threadChans, ch)
-					q.lock.Unlock()
-					action := <-ch
-					if action == stop && q.IsEmpty() {
-						break
-					}
-				}
-				q.lock.Lock()
-				atomic.AddInt32(&q.activeThreadCount, 1)
-				q.lock.Unlock()
-				rb, err := q.storage.GetRequest()
-				if err != nil || rb == nil {
-					q.finish()
-					continue
+		go independentRunner(requestc, complete)
+	}
+	go q.loop(c, requestc, complete, errc)
+	defer close(requestc)
+	return <-errc
+}
+
+// Stop will stop the running queue
+func (q *Queue) Stop() {
+	q.mut.Lock()
+	q.running = false
+	q.mut.Unlock()
+}
+
+func (q *Queue) loop(c *colly.Collector, requestc chan<- *colly.Request, complete <-chan struct{}, errc chan<- error) {
+	var active int
+	for {
+		size, err := q.storage.QueueSize()
+		if err != nil {
+			errc <- err
+			break
+		}
+		if size == 0 && active == 0 || !q.running {
+			// Terminate when
+			// 1. there are no active requests and the queue is empty, or
+			// 2. the queue was stopped
+			errc <- nil
+			break
+		}
+		sent := requestc
+		var req *colly.Request
+		if size > 0 {
+			req, err = q.loadRequest(c)
+			if err != nil {
+				// ignore an error returned by GetRequest() or
+				// UnmarshalRequest()
+				continue
+			}
+		} else {
+			sent = nil
+		}
+	Sent:
+		for {
+			select {
+			case sent <- req:
+				active++
+				break Sent
+			case <-q.wake:
+				if sent == nil {
+					break Sent
 				}
-				r, err := c.UnmarshalRequest(rb)
-				if err != nil || r == nil {
-					q.finish()
-					continue
+			case <-complete:
+				active--
+				if sent == nil && active == 0 {
+					break Sent
 				}
-				r.Do()
-				q.finish()
 			}
-		}(c, wg)
+		}
 	}
-	wg.Wait()
-	return nil
 }
 
-func (q *Queue) finish() {
-	q.lock.Lock()
-	q.activeThreadCount--
-	for _, c := range q.threadChans {
-		c <- stop
+func independentRunner(requestc <-chan *colly.Request, complete chan<- struct{}) {
+	for req := range requestc {
+		req.Do()
+		complete <- struct{}{}
+	}
+}
+
+func (q *Queue) loadRequest(c *colly.Collector) (*colly.Request, error) {
+	buf, err := q.storage.GetRequest()
+	if err != nil {
+		return nil, err
 	}
-	q.threadChans = make([]chan bool, 0, q.Threads)
-	q.lock.Unlock()
+	copied := make([]byte, len(buf))
+	copy(copied, buf)
+	return c.UnmarshalRequest(copied)
 }
 
 // Init implements Storage.Init() function
@@ -180,7 +233,7 @@ func (q *InMemoryQueueStorage) AddRequest(r []byte) error {
 	defer q.lock.Unlock()
 	// Discard URLs if size limit exceeded
 	if q.MaxSize > 0 && q.size >= q.MaxSize {
-		return nil
+		return colly.ErrQueueFull
 	}
 	i := &inMemoryQueueItem{Request: r}
 	if q.first == nil {
diff --git a/queue/queue_test.go b/queue/queue_test.go
new file mode 100644
index 000000000..1d10f8377
--- /dev/null
+++ b/queue/queue_test.go
@@ -0,0 +1,112 @@
+package queue
+
+import (
+	"math/rand"
+	"net/http"
+	"net/http/httptest"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/gocolly/colly/v2"
+)
+
+func TestQueue(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(serverHandler))
+	defer server.Close()
+
+	rng := rand.New(rand.NewSource(12387123712321232))
+	var rngMu sync.Mutex
+
+	var (
+		items    uint32
+		requests uint32
+		success  uint32
+		failure  uint32
+	)
+	storage := &InMemoryQueueStorage{MaxSize: 100000}
+	q, err := New(10, storage)
+	if err != nil {
+		panic(err)
+	}
+	put := func() {
+		rngMu.Lock()
+		t := time.Duration(rng.Intn(50)) * time.Microsecond
+		rngMu.Unlock()
+		url := server.URL + "/delay?t=" + t.String()
+		atomic.AddUint32(&items, 1)
+		q.AddURL(url)
+	}
+	for i := 0; i < 3000; i++ {
+		put()
+		storage.AddRequest([]byte("error request"))
+	}
+	c := colly.NewCollector(
+		colly.AllowURLRevisit(),
+	)
+	c.OnRequest(func(req *colly.Request) {
+		atomic.AddUint32(&requests, 1)
+	})
+	c.OnResponse(func(resp *colly.Response) {
+		if resp.StatusCode == http.StatusOK {
+			atomic.AddUint32(&success, 1)
+		} else {
+			atomic.AddUint32(&failure, 1)
+		}
+		rngMu.Lock()
+		toss := rng.Intn(2) == 0
+		rngMu.Unlock()
+		if toss {
+			put()
+		}
+	})
+	c.OnError(func(resp *colly.Response, err error) {
+		atomic.AddUint32(&failure, 1)
+	})
+	err = q.Run(c)
+	if err != nil {
+		t.Fatalf("Queue.Run() returned an error: %v", err)
+	}
+	if items != requests || success+failure != requests || failure > 0 {
+		t.Fatalf("wrong Queue implementation: "+
+			"items = %d, requests = %d, success = %d, failure = %d",
+			items, requests, success, failure)
+	}
+}
+
+func serverHandler(w http.ResponseWriter, req *http.Request) {
+	if !serverRoute(w, req) {
+		shutdown(w)
+	}
+}
+
+func serverRoute(w http.ResponseWriter, req *http.Request) bool {
+	if req.URL.Path == "/delay" {
+		return serveDelay(w, req) == nil
+	}
+	return false
+}
+
+func serveDelay(w http.ResponseWriter, req *http.Request) error {
+	q := req.URL.Query()
+	t, err := time.ParseDuration(q.Get("t"))
+	if err != nil {
+		return err
+	}
+	time.Sleep(t)
+	w.WriteHeader(http.StatusOK)
+	return nil
+}
+
+func shutdown(w http.ResponseWriter) {
+	taker, ok := w.(http.Hijacker)
+	if !ok {
+		return
+	}
+	raw, _, err := taker.Hijack()
+	if err != nil {
+		return
+	}
+	raw.Close()
+}
diff --git a/request.go b/request.go
index 4b94cd209..5c80e2bb8 100644
--- a/request.go
+++ b/request.go
@@ -18,7 +18,6 @@ import (
 	"bytes"
 	"encoding/json"
 	"io"
-	"io/ioutil"
 	"net/http"
 	"net/url"
 	"strings"
@@ -31,6 +30,8 @@ type Request struct {
 	URL *url.URL
 	// Headers contains the Request's HTTP headers
 	Headers *http.Header
+	// Host is the value of the Host header to be sent with the request
+	Host string
 	// Ctx is a context between a Request and a Response
 	Ctx *Context
 	// Depth is the number of the parents of the request
@@ -55,24 +56,31 @@ type Request struct {
 type serializableRequest struct {
 	URL     string
 	Method  string
+	Depth   int
 	Body    []byte
 	ID      uint32
 	Ctx     map[string]interface{}
 	Headers http.Header
+	Host    string
 }
 
 // New creates a new request with the context of the original request
 func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
-	u, err := url.Parse(URL)
+	u, err := urlParser.Parse(URL)
+	if err != nil {
+		return nil, err
+	}
+	u2, err := url.Parse(u.Href(false))
 	if err != nil {
 		return nil, err
 	}
 	return &Request{
 		Method:    method,
-		URL:       u,
+		URL:       u2,
 		Body:      body,
 		Ctx:       r.Ctx,
 		Headers:   &http.Header{},
+		Host:      r.Host,
 		ID:        atomic.AddUint32(&r.collector.requestCount, 1),
 		collector: r.collector,
 	}, nil
@@ -96,15 +104,12 @@ func (r *Request) AbsoluteURL(u string) string {
 	} else {
 		base = r.URL
 	}
-	absURL, err := base.Parse(u)
+
+	absURL, err := urlParser.ParseRef(base.String(), u)
 	if err != nil {
 		return ""
 	}
-	absURL.Fragment = ""
-	if absURL.Scheme == "//" {
-		absURL.Scheme = r.URL.Scheme
-	}
-	return absURL.String()
+	return absURL.Href(false)
 }
 
 // Visit continues Collector's collecting job by creating a
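AbsoluteURL now resolves references through the WHATWG-compliant parser instead of net/url. The call site is unchanged — a sketch with an invented selector; HTMLElement.Request, Attr, and Visit are existing colly APIs:

```go
package main

import (
	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Relative hrefs are resolved against the page URL using
		// WHATWG URL rules; an empty string signals a parse failure.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			e.Request.Visit(link)
		}
	})
	c.Visit("https://example.com/")
}
```
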
@@ -114,6 +119,11 @@ func (r *Request) Visit(URL string) error {
 	return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true)
 }
 
+// HasVisited checks if the provided URL has been visited
+func (r *Request) HasVisited(URL string) (bool, error) {
+	return r.collector.HasVisited(URL)
+}
+
 // Post continues a collector job by creating a POST request and preserves the Context
 // of the previous request.
 // Post also calls the previously provided callbacks
@@ -141,6 +151,10 @@ func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error
 
 // Retry submits HTTP request again with the same parameters
 func (r *Request) Retry() error {
+	r.Headers.Del("Cookie")
+	if _, ok := r.Body.(io.ReadSeeker); r.Body != nil && !ok {
+		return ErrRetryBodyUnseekable
+	}
 	return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
 }
 
@@ -161,14 +175,16 @@ func (r *Request) Marshal() ([]byte, error) {
 	var err error
 	var body []byte
 	if r.Body != nil {
-		body, err = ioutil.ReadAll(r.Body)
+		body, err = io.ReadAll(r.Body)
 		if err != nil {
 			return nil, err
 		}
 	}
 	sr := &serializableRequest{
 		URL:    r.URL.String(),
+		Host:   r.Host,
 		Method: r.Method,
+		Depth:  r.Depth,
 		Body:   body,
 		ID:     r.ID,
 		Ctx:    ctx,
diff --git a/response.go b/response.go
index 29ba6ae14..30cdeae66 100644
--- a/response.go
+++ b/response.go
@@ -17,9 +17,10 @@ package colly
 import (
 	"bytes"
 	"fmt"
-	"io/ioutil"
+	"io"
 	"mime"
 	"net/http"
+	"os"
 	"strings"
 
 	"github.com/saintfish/chardet"
@@ -38,11 +39,14 @@
 	Request *Request
 	// Headers contains the Response's HTTP headers
 	Headers *http.Header
+	// Trace contains the HTTPTrace for the request. Will only be set by the
+	// collector if Collector.TraceHTTP is set to true.
+	Trace *HTTPTrace
 }
 
 // Save writes response body to disk
 func (r *Response) Save(fileName string) error {
-	return ioutil.WriteFile(fileName, r.Body, 0644)
+	return os.WriteFile(fileName, r.Body, 0644)
 }
 
 // FileName returns the sanitized file name parsed from "Content-Disposition"
@@ -59,6 +63,9 @@ func (r *Response) FileName() string {
 }
 
 func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error {
+	if len(r.Body) == 0 {
+		return nil
+	}
 	if defaultEncoding != "" {
 		tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding)
 		if err != nil {
@@ -68,6 +75,16 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error
 		return nil
 	}
 	contentType := strings.ToLower(r.Headers.Get("Content-Type"))
+
+	if strings.Contains(contentType, "image/") ||
+		strings.Contains(contentType, "video/") ||
+		strings.Contains(contentType, "audio/") ||
+		strings.Contains(contentType, "font/") {
+		// These MIME types should not have textual data.
+		return nil
+	}
+
 	if !strings.Contains(contentType, "charset") {
 		if !detectCharset {
 			return nil
@@ -95,5 +112,5 @@ func encodeBytes(b []byte, contentType string) ([]byte, error) {
 	if err != nil {
 		return nil, err
 	}
-	return ioutil.ReadAll(r)
+	return io.ReadAll(r)
 }
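The Retry change above is easiest to see from an error callback. A hedged sketch — OnError and Request.Retry are existing colly APIs; restricting the retry to GET is just a conservative choice for the example, since requests whose bodies don't implement io.ReadSeeker now fail with ErrRetryBodyUnseekable:

```go
package main

import (
	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnError(func(r *colly.Response, err error) {
		// Retry now strips the Cookie header first, so the retried
		// request starts from a clean slate.
		if r.Request.Method == "GET" {
			r.Request.Retry()
		}
	})
	c.Visit("https://example.com/")
}
```
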
+// - "selector" (required): CSS (goquery) selector of the desired data +// - "attr" (optional): Selects the matching element's attribute's value. // Leave it blank or omit to get the text of the element. // // Example struct declaration: // -// type Nested struct { -// String string `selector:"div > p"` -// Classes []string `selector:"li" attr:"class"` -// Struct *Nested `selector:"div > div"` -// } +// type Nested struct { +// String string `selector:"div > p"` +// Classes []string `selector:"li" attr:"class"` +// Struct *Nested `selector:"div > div"` +// } // // Supported types: struct, *struct, string, []string func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error { diff --git a/xmlelement.go b/xmlelement.go index 7ff5fe553..857900e85 100644 --- a/xmlelement.go +++ b/xmlelement.go @@ -15,7 +15,6 @@ package colly import ( - "encoding/xml" "strings" "github.com/antchfx/htmlquery" @@ -76,7 +75,7 @@ func (h *XMLElement) Attr(k string) string { } } } else { - for _, a := range h.attributes.([]xml.Attr) { + for _, a := range h.attributes.([]xmlquery.Attr) { if a.Name.Local == k { return a.Value } diff --git a/xmlelement_test.go b/xmlelement_test.go index ac7a1aeca..90a434826 100644 --- a/xmlelement_test.go +++ b/xmlelement_test.go @@ -16,7 +16,7 @@ package colly_test import ( "github.com/antchfx/htmlquery" - "github.com/gocolly/colly" + "github.com/gocolly/colly/v2" "reflect" "strings" "testing"