diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
deleted file mode 100644
index e17c682be..000000000
--- a/.github/ISSUE_TEMPLATE.md
+++ /dev/null
@@ -1,2 +0,0 @@
-
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..e330bb3e5
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,15 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..1f8e627d7
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Question
+    url: https://stackoverflow.com/
+    about: Questions should go to Stack Overflow. You can use the go-colly tag.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..bf3b90799
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,13 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 000000000..20f2af00a
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,81 @@
+---
+name: CI
+on:
+  push:
+    branches:
+      - '**'
+  pull_request:
+
+jobs:
+  test:
+    name: Test ${{matrix.go}}
+    runs-on: [ubuntu-latest]
+    strategy:
+      fail-fast: false
+      max-parallel: 4
+      matrix:
+        go: [
+          "1.22",
+          "1.21",
+          "1.20",
+          "1.19",
+        ]
+
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@v2
+
+      - name: Setup go
+        uses: actions/setup-go@v2
+        with:
+          go-version: ${{matrix.go}}
+
+      - name: Test
+        run: |
+          go install golang.org/x/lint/golint@latest
+          OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && exit 1)
+          OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && exit 1)
+          golint -set_exit_status
+          go vet -v ./...
+          go test -race -v -coverprofile=coverage.txt -covermode=atomic ./...
+
+  build:
+    name: Build ${{matrix.go}}
+    runs-on: [ubuntu-latest]
+    strategy:
+      fail-fast: false
+      max-parallel: 4
+      matrix:
+        go: [
+          "1.22",
+          "1.21",
+          "1.20",
+          "1.19",
+        ]
+
+    steps:
+      - name: Checkout branch
+        uses: actions/checkout@v2
+
+      - name: Setup go
+        uses: actions/setup-go@v2
+        with:
+          go-version: ${{matrix.go}}
+
+      - name: Build
+        run: |
+          go install golang.org/x/lint/golint@latest
+          OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && exit 1)
+          OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && exit 1)
+          golint -set_exit_status
+          go build
+
+  codecov:
+    name: Codecov
+    runs-on: [ubuntu-latest]
+    needs:
+      - test
+      - build
+    steps:
+      - name: Run Codecov
+        run: bash <(curl -s https://codecov.io/bash)
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index d72ef3847..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-language: go
-sudo: false
-go:
-  - 1.9.x
-  - 1.10.x
-  - 1.11.x
-  - tip
-script:
-  - go get -u golang.org/x/lint/golint
-  - OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
-  - OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
-  - OUT="$(golint ./...)"; test -z "$OUT" || (echo "$OUT" && return 1)
-  - go vet -v ./...
-  - go test -race -v -coverprofile=coverage.txt -covermode=atomic ./
-  - go build
-after_success:
-  - bash <(curl -s https://codecov.io/bash)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 933d9eff1..166327f1e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,23 @@
+# 2.1.0 - 2020.06.09
+
+ - HTTP tracing support
+ - New callback: OnResponseHeader
+ - Queue fixes
+ - New collector option: Collector.CheckHead
+ - Proxy fixes
+ - Fixed POST revisit checking
+ - Updated dependencies
+
+# 2.0.0 - 2019.11.28
+
+ - Breaking change: the Collector.RedirectHandler member was replaced by the Collector.SetRedirectHandler function
+ - Go module support
+ - Collector.HasVisited method added to check whether a URL has been visited
+ - Collector.SetClient method introduced
+ - HTMLElement.ChildTexts method added
+ - New user agents
+ - Multiple bugfixes
+
# 1.2.0 - 2019.02.13
- Compatibility with the latest htmlquery package
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 17df63602..c42dbc8f8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -13,7 +13,7 @@ We welcome any type of contribution, not only code. You can help with
## Your First Contribution
-Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github).
+Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://app.egghead.io/playlists/how-to-contribute-to-an-open-source-project-on-github).
## Submitting code
diff --git a/README.md b/README.md
index 06e73cbea..6205799ed 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@ Colly provides a clean interface to write any kind of crawler/scraper/spider.
With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving.
-[](https://godoc.org/github.com/gocolly/colly)
-[](#backers) [](#sponsors) [](https://travis-ci.org/gocolly/colly)
+[](https://pkg.go.dev/github.com/gocolly/colly/v2)
+[](#backers) [](#sponsors) [](https://github.com/gocolly/colly/actions/workflows/ci.yml)
[](http://goreportcard.com/report/gocolly/colly)
[](https://github.com/gocolly/colly/tree/master/_examples)
[](https://codecov.io/github/gocolly/colly?branch=master)
@@ -15,20 +15,39 @@ With Colly you can easily extract structured data from websites, which can be us
[](https://twitter.com/gocolly)
-## Features
+------
+
+
+## Sponsors
+
+
+
- * Clean API
- * Fast (>1k request/sec on a single core)
- * Manages request delays and maximum concurrency per domain
- * Automatic cookie and session handling
- * Sync/async/parallel scraping
- * Caching
- * Automatic encoding of non-unicode responses
- * Robots.txt support
- * Distributed scraping
- * Configuration via environment variables
- * Extensions
+[Scrapfly](https://scrapfly.io/?utm_source=Github&utm_medium=repo&utm_campaign=colly)
+is an enterprise-grade solution providing a Web Scraping API that aims to simplify the
+scraping process by managing everything: real browser rendering, rotating proxies, and
+fingerprints (TLS, HTTP, browser) to bypass all major anti-bots. Scrapfly also provides
+observability through an analytical dashboard that measures success and block rates in
+detail.
+
+
+------
+
+
+
+## Features
+- Clean API
+- Fast (>1k request/sec on a single core)
+- Manages request delays and maximum concurrency per domain
+- Automatic cookie and session handling
+- Sync/async/parallel scraping
+- Caching
+- Automatic encoding of non-unicode responses
+- Robots.txt support
+- Distributed scraping
+- Configuration via environment variables
+- Extensions
## Example
@@ -51,29 +70,44 @@ func main() {
See [examples folder](https://github.com/gocolly/colly/tree/master/_examples) for more detailed examples.
-
## Installation
+Add colly to your `go.mod` file:
+
```
-go get -u github.com/gocolly/colly/...
-```
+module github.com/x/y
+
+go 1.14
+
+require (
+	github.com/gocolly/colly/v2 v2.1.0
+)
+```
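+
+Alternatively, let `go get` resolve the newest tagged version and update `go.mod` for you:
+
+```
+go get github.com/gocolly/colly/v2
+```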
## Bugs
Bugs or suggestions? Visit the [issue tracker](https://github.com/gocolly/colly/issues) or join `#colly` on freenode
-
## Other Projects Using Colly
Below is a list of public, open source projects that use Colly:
- * [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive
- * [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go
- * [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scrapper, making your day a little better!
- * [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you.
- * [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects.
- * [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line.
+- [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive.
+- [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go.
+- [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scraper, making your day a little better!
+- [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you.
+- [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects.
+- [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line.
+- [yringler/inside-chassidus-scraper](https://github.com/yringler/inside-chassidus-scraper) Scrapes Rabbi Paltiel's web site for lesson metadata.
+- [gamedb/gamedb](https://github.com/gamedb/gamedb) A database of Steam games.
+- [lawzava/scrape](https://github.com/lawzava/scrape) CLI for email scraping from any website.
+- [eureka101v/WeiboSpiderGo](https://github.com/eureka101v/WeiboSpiderGo) A Sina Weibo (Chinese Twitter) scraper.
+- [Go-phie/gophie](https://github.com/Go-phie/gophie) Search, download and stream movies from your terminal.
+- [imthaghost/goclone](https://github.com/imthaghost/goclone) Clone websites to your computer within seconds.
+- [superiss/spidy](https://github.com/superiss/spidy) Crawl the web and collect expired domains.
+- [docker-slim/docker-slim](https://github.com/docker-slim/docker-slim) Optimize your Docker containers to make them smaller and better.
+- [seversky/gachifinder](https://github.com/seversky/gachifinder) An agent for asynchronous scraping, parsing and writing to multiple storage backends (Elasticsearch for now).
+- [eval-exec/goodreads](https://github.com/eval-exec/goodreads) Crawl all tags and all pages of quotes from Goodreads.
If you are using Colly in a project please send a pull request to add it to the list.
@@ -82,14 +116,12 @@ If you are using Colly in a project please send a pull request to add it to the
This project exists thanks to all the people who contribute. [[Contribute]](CONTRIBUTING.md).
-
## Backers
Thank you to all our backers! 🙏 [[Become a backer](https://opencollective.com/colly#backer)]
-
## Sponsors
Support this project by becoming a sponsor. Your logo will show up here with a link to your website. [[Become a sponsor](https://opencollective.com/colly#sponsor)]
@@ -105,8 +137,6 @@ Support this project by becoming a sponsor. Your logo will show up here with a l
-
-
-
## License
+
[](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_large)
diff --git a/VERSION b/VERSION
index 26aaba0e8..7ec1d6db4 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.0
+2.1.0
diff --git a/_examples/basic/basic.go b/_examples/basic/basic.go
index cd7abf4c1..b3f37251b 100644
--- a/_examples/basic/basic.go
+++ b/_examples/basic/basic.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/coursera_courses/coursera_courses.go b/_examples/coursera_courses/coursera_courses.go
index 45544a383..8526b9a95 100644
--- a/_examples/coursera_courses/coursera_courses.go
+++ b/_examples/coursera_courses/coursera_courses.go
@@ -6,7 +6,7 @@ import (
"os"
"strings"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// Course stores information about a coursera course
@@ -18,11 +18,18 @@ type Course struct {
URL string
Language string
Commitment string
- HowToPass string
Rating string
}
func main() {
+ fName := "courses.json"
+ file, err := os.Create(fName)
+ if err != nil {
+ log.Fatalf("Cannot create file %q: %s\n", fName, err)
+ }
+ defer file.Close()
+
// Instantiate default collector
c := colly.NewCollector(
// Visit only domains: coursera.org, www.coursera.org
@@ -38,7 +45,7 @@ func main() {
courses := make([]Course, 0, 200)
- // On every a element which has href attribute call callback
+ // On every element which has "href" attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
// If attribute class is this long string return from callback
// As this a is irrelevant
@@ -59,8 +66,8 @@ func main() {
log.Println("visiting", r.URL.String())
})
- // On every a HTML element which has name attribute call callback
- c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
+ // On every element with collection-product-card class call callback
+ c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
// Activate detailCollector if the link contains "coursera.org/learn"
courseURL := e.Request.AbsoluteURL(e.Attr("href"))
if strings.Index(courseURL, "coursera.org/learn") != -1 {
@@ -71,7 +78,7 @@ func main() {
// Extract details of the course
detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
log.Println("Course found", e.Request.URL)
- title := e.ChildText(".course-title")
+ title := e.ChildText(".banner-title")
if title == "" {
log.Println("No title found", e.Request.URL)
}
@@ -79,22 +86,23 @@ func main() {
Title: title,
URL: e.Request.URL.String(),
Description: e.ChildText("div.content"),
- Creator: e.ChildText("div.creator-names > span"),
+ Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"),
+ Rating: e.ChildText("span.number-rating"),
}
- // Iterate over rows of the table which contains different information
- // about the course
- e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
- switch el.ChildText("td:first-child") {
- case "Language":
- course.Language = el.ChildText("td:nth-child(2)")
+ // Iterate over div components and add details to course
+ e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) {
+ svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ")
+ lastWord := svgTitle[len(svgTitle)-1]
+ switch lastWord {
+ // svg Title: Available Languages
+ case "languages":
+ course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)")
+ // svg Title: Mixed/Beginner/Intermediate/Advanced Level
case "Level":
- course.Level = el.ChildText("td:nth-child(2)")
- case "Commitment":
- course.Commitment = el.ChildText("td:nth-child(2)")
- case "How To Pass":
- course.HowToPass = el.ChildText("td:nth-child(2)")
- case "User Ratings":
- course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
+ course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)")
+ // svg Title: Hours to complete
+ case "complete":
+ course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)")
}
})
courses = append(courses, course)
@@ -103,7 +111,7 @@ func main() {
// Start scraping on http://coursera.com/browse
c.Visit("https://coursera.org/browse")
- enc := json.NewEncoder(os.Stdout)
+ enc := json.NewEncoder(file)
enc.SetIndent("", " ")
- // Dump json to the standard output
+ // Dump json to the file
diff --git a/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go b/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go
index b84bb3457..3de34d8f8 100644
--- a/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go
+++ b/_examples/cryptocoinmarketcap/cryptocoinmarketcap.go
@@ -5,7 +5,7 @@ import (
"log"
"os"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
@@ -20,21 +20,22 @@ func main() {
defer writer.Flush()
// Write CSV header
- writer.Write([]string{"Name", "Symbol", "Price (USD)", "Volume (USD)", "Market capacity (USD)", "Change (1h)", "Change (24h)", "Change (7d)"})
+ writer.Write([]string{"Name", "Symbol", "Market Cap (USD)", "Price (USD)", "Circulating Supply (USD)", "Volume (24h)", "Change (1h)", "Change (24h)", "Change (7d)"})
// Instantiate default collector
c := colly.NewCollector()
- c.OnHTML("#currencies-all tbody tr", func(e *colly.HTMLElement) {
+ c.OnHTML("tbody tr", func(e *colly.HTMLElement) {
writer.Write([]string{
- e.ChildText(".currency-name-container"),
- e.ChildText(".col-symbol"),
- e.ChildAttr("a.price", "data-usd"),
- e.ChildAttr("a.volume", "data-usd"),
- e.ChildAttr(".market-cap", "data-usd"),
- e.ChildText(".percent-1h"),
- e.ChildText(".percent-24h"),
- e.ChildText(".percent-7d"),
+ e.ChildText(".cmc-table__column-name"),
+ e.ChildText(".cmc-table__cell--sort-by__symbol"),
+ e.ChildText(".cmc-table__cell--sort-by__market-cap"),
+ e.ChildText(".cmc-table__cell--sort-by__price"),
+ e.ChildText(".cmc-table__cell--sort-by__circulating-supply"),
+ e.ChildText(".cmc-table__cell--sort-by__volume-24-h"),
+ e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"),
+ e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"),
+ e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"),
})
})
diff --git a/_examples/error_handling/error_handling.go b/_examples/error_handling/error_handling.go
index 7ac9d8f54..7d9d3d797 100644
--- a/_examples/error_handling/error_handling.go
+++ b/_examples/error_handling/error_handling.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/factba.se/factbase.go b/_examples/factba.se/factbase.go
index 76edfc067..440acfd98 100644
--- a/_examples/factba.se/factbase.go
+++ b/_examples/factba.se/factbase.go
@@ -3,10 +3,10 @@ package main
import (
"encoding/json"
"fmt"
- "io/ioutil"
+ "os"
"strconv"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
var baseSearchURL = "https://factba.se/json/json-transcript.php?q=&f=&dt=&p="
@@ -45,7 +45,7 @@ func main() {
if err != nil {
return
}
- ioutil.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644)
+ os.WriteFile(colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug"))+".json", jsonData, 0644)
})
stop := false
diff --git a/_examples/google_groups/google_groups.go b/_examples/google_groups/google_groups.go
index cbeb97e9c..d838a831d 100644
--- a/_examples/google_groups/google_groups.go
+++ b/_examples/google_groups/google_groups.go
@@ -7,7 +7,7 @@ import (
"os"
"strings"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// Mail is the container of a single e-mail
diff --git a/_examples/hackernews_comments/hackernews_comments.go b/_examples/hackernews_comments/hackernews_comments.go
index 8859a5f3c..1ecc3086a 100644
--- a/_examples/hackernews_comments/hackernews_comments.go
+++ b/_examples/hackernews_comments/hackernews_comments.go
@@ -8,7 +8,7 @@ import (
"strconv"
"strings"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
type comment struct {
diff --git a/_examples/instagram/instagram.go b/_examples/instagram/instagram.go
index de2b21368..c514ce0e7 100644
--- a/_examples/instagram/instagram.go
+++ b/_examples/instagram/instagram.go
@@ -10,7 +10,7 @@ import (
"regexp"
"strings"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// "id": user id, "after": end cursor
diff --git a/_examples/local_files/local_files b/_examples/local_files/local_files
deleted file mode 100755
index 4d7f677b5..000000000
Binary files a/_examples/local_files/local_files and /dev/null differ
diff --git a/_examples/local_files/local_files.go b/_examples/local_files/local_files.go
index ba2b9865c..3473a67f5 100644
--- a/_examples/local_files/local_files.go
+++ b/_examples/local_files/local_files.go
@@ -6,7 +6,7 @@ import (
"os"
"path/filepath"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/login/login.go b/_examples/login/login.go
index eeadeba36..ab7a6b06d 100644
--- a/_examples/login/login.go
+++ b/_examples/login/login.go
@@ -3,7 +3,7 @@ package main
import (
"log"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/max_depth/max_depth.go b/_examples/max_depth/max_depth.go
index 2e28e568c..d11af1805 100644
--- a/_examples/max_depth/max_depth.go
+++ b/_examples/max_depth/max_depth.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/multipart/multipart.go b/_examples/multipart/multipart.go
index d8809241f..6d74facf9 100644
--- a/_examples/multipart/multipart.go
+++ b/_examples/multipart/multipart.go
@@ -2,19 +2,19 @@ package main
import (
"fmt"
- "io/ioutil"
+ "io"
"net/http"
"os"
"time"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func generateFormData() map[string][]byte {
f, _ := os.Open("gocolly.jpg")
defer f.Close()
- imgData, _ := ioutil.ReadAll(f)
+ imgData, _ := io.ReadAll(f)
return map[string][]byte{
"firstname": []byte("one"),
diff --git a/_examples/openedx_courses/openedx_courses.go b/_examples/openedx_courses/openedx_courses.go
index 293637541..f9a70d1c2 100644
--- a/_examples/openedx_courses/openedx_courses.go
+++ b/_examples/openedx_courses/openedx_courses.go
@@ -6,11 +6,11 @@ import (
"strings"
"time"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// DATE_FORMAT default format date used in openedx
-const DATE_FORMAT = "Jan 02, 2006"
+const DATE_FORMAT = "02 Jan, 2006"
// Course store openedx course data
type Course struct {
@@ -42,18 +42,19 @@ func main() {
if !strings.HasPrefix(link, "/courses/") {
return
}
- // start scaping the page under the link found
+ // start scraping the page under the link found
e.Request.Visit(link)
})
- c.OnHTML("div[class=content-wrapper]", func(e *colly.HTMLElement) {
- if e.DOM.Find("section.course-info").Length() == 0 {
+ c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) {
+ if e.DOM.Find("section#course-info").Length() == 0 {
return
}
- title := strings.Split(e.ChildText(".course-title"), "\n")[0]
+ title := strings.Split(e.ChildText(".course-info__title"), "\n")[0]
course_id := e.ChildAttr("input[name=course_id]", "value")
- start_date, _ := time.Parse(DATE_FORMAT, e.ChildText("span.start-date"))
- end_date, _ := time.Parse(DATE_FORMAT, e.ChildText("span.final-date"))
+ texts := e.ChildTexts("span[data-datetime]")
+ start_date, _ := time.Parse(DATE_FORMAT, texts[0])
+ end_date, _ := time.Parse(DATE_FORMAT, texts[1])
var run string
if len(strings.Split(course_id, "_")) > 1 {
run = strings.Split(course_id, "_")[1]
diff --git a/_examples/parallel/parallel.go b/_examples/parallel/parallel.go
index 837b85b20..9a5c9ae62 100644
--- a/_examples/parallel/parallel.go
+++ b/_examples/parallel/parallel.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
@@ -12,7 +12,7 @@ func main() {
// MaxDepth is 2, so only the links on the scraped page
// and links on those pages are visited
colly.MaxDepth(2),
- colly.Async(true),
+ colly.Async(),
)
// Limit the maximum parallelism to 2
diff --git a/_examples/proxy_switcher/proxy_switcher.go b/_examples/proxy_switcher/proxy_switcher.go
index 022699f47..4f0154483 100644
--- a/_examples/proxy_switcher/proxy_switcher.go
+++ b/_examples/proxy_switcher/proxy_switcher.go
@@ -4,8 +4,8 @@ import (
"bytes"
"log"
- "github.com/gocolly/colly"
- "github.com/gocolly/colly/proxy"
+ "github.com/gocolly/colly/v2"
+ "github.com/gocolly/colly/v2/proxy"
)
func main() {
diff --git a/_examples/queue/queue.go b/_examples/queue/queue.go
index ddf70a660..e6cacc3c2 100644
--- a/_examples/queue/queue.go
+++ b/_examples/queue/queue.go
@@ -3,8 +3,8 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
- "github.com/gocolly/colly/queue"
+ "github.com/gocolly/colly/v2"
+ "github.com/gocolly/colly/v2/queue"
)
func main() {
diff --git a/_examples/random_delay/random_delay.go b/_examples/random_delay/random_delay.go
index d9f58a250..21037efdf 100644
--- a/_examples/random_delay/random_delay.go
+++ b/_examples/random_delay/random_delay.go
@@ -4,8 +4,8 @@ import (
"fmt"
"time"
- "github.com/gocolly/colly"
- "github.com/gocolly/colly/debug"
+ "github.com/gocolly/colly/v2"
+ "github.com/gocolly/colly/v2/debug"
)
func main() {
@@ -15,7 +15,7 @@ func main() {
c := colly.NewCollector(
// Attach a debugger to the collector
colly.Debugger(&debug.LogDebugger{}),
- colly.Async(true),
+ colly.Async(),
)
// Limit the number of threads started by colly to two
diff --git a/_examples/rate_limit/rate_limit.go b/_examples/rate_limit/rate_limit.go
index e17f4941f..0c533ceea 100644
--- a/_examples/rate_limit/rate_limit.go
+++ b/_examples/rate_limit/rate_limit.go
@@ -3,8 +3,8 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
- "github.com/gocolly/colly/debug"
+ "github.com/gocolly/colly/v2"
+ "github.com/gocolly/colly/v2/debug"
)
func main() {
@@ -13,7 +13,7 @@ func main() {
// Instantiate default collector
c := colly.NewCollector(
// Turn on asynchronous requests
- colly.Async(true),
+ colly.Async(),
// Attach a debugger to the collector
colly.Debugger(&debug.LogDebugger{}),
)
diff --git a/_examples/reddit/reddit.go b/_examples/reddit/reddit.go
index bd69f396d..06e86148c 100644
--- a/_examples/reddit/reddit.go
+++ b/_examples/reddit/reddit.go
@@ -5,7 +5,7 @@ import (
"os"
"time"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
type item struct {
@@ -23,6 +23,7 @@ func main() {
c := colly.NewCollector(
// Visit only domains: old.reddit.com
colly.AllowedDomains("old.reddit.com"),
+ // Turn on asynchronous requests
colly.Async(true),
)
diff --git a/_examples/request_context/request_context.go b/_examples/request_context/request_context.go
index b4b79b435..ace7edfbe 100644
--- a/_examples/request_context/request_context.go
+++ b/_examples/request_context/request_context.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/scraper_server/scraper_server.go b/_examples/scraper_server/scraper_server.go
index 6d0f0d85b..3c4bca6e9 100644
--- a/_examples/scraper_server/scraper_server.go
+++ b/_examples/scraper_server/scraper_server.go
@@ -5,7 +5,7 @@ import (
"log"
"net/http"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
type pageInfo struct {
diff --git a/_examples/shopify_sitemap/shopify_sitemap.go b/_examples/shopify_sitemap/shopify_sitemap.go
index c769f37f4..e26d8a185 100644
--- a/_examples/shopify_sitemap/shopify_sitemap.go
+++ b/_examples/shopify_sitemap/shopify_sitemap.go
@@ -3,7 +3,7 @@ package main
import (
"fmt"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/url_filter/url_filter.go b/_examples/url_filter/url_filter.go
index a4560f94c..a9210f127 100644
--- a/_examples/url_filter/url_filter.go
+++ b/_examples/url_filter/url_filter.go
@@ -4,7 +4,7 @@ import (
"fmt"
"regexp"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
diff --git a/_examples/xkcd_store/xkcd_store.go b/_examples/xkcd_store/xkcd_store.go
index e77a6cff6..43f233a16 100644
--- a/_examples/xkcd_store/xkcd_store.go
+++ b/_examples/xkcd_store/xkcd_store.go
@@ -5,7 +5,7 @@ import (
"log"
"os"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
@@ -33,7 +33,7 @@ func main() {
e.ChildAttr("a", "title"),
e.ChildText("span"),
e.Request.AbsoluteURL(e.ChildAttr("a", "href")),
- "https" + e.ChildAttr("img", "src"),
+ "https:" + e.ChildAttr("img", "src"),
})
})
diff --git a/assets/scrapfly.png b/assets/scrapfly.png
new file mode 100644
index 000000000..e4b384f66
Binary files /dev/null and b/assets/scrapfly.png differ
diff --git a/cmd/colly/colly.go b/cmd/colly/colly.go
index 8ad240a37..a8e626fd3 100644
--- a/cmd/colly/colly.go
+++ b/cmd/colly/colly.go
@@ -29,7 +29,7 @@ var scraperHeadTemplate = `package main
import (
"log"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
func main() {
@@ -48,19 +48,19 @@ var htmlCallbackTemplate = `
`
var requestCallbackTemplate = `
- c.OnRequest("element-selector", func(r *colly.Request) {
+ c.OnRequest(func(r *colly.Request) {
log.Println("Visiting", r.URL)
})
`
var responseCallbackTemplate = `
- c.OnResponse("element-selector", func(r *colly.Response) {
+ c.OnResponse(func(r *colly.Response) {
log.Println("Visited", r.Request.URL, r.StatusCode)
})
`
var errorCallbackTemplate = `
- c.OnError("element-selector", func(r *colly.Response, err error) {
+ c.OnError(func(r *colly.Response, err error) {
log.Printf("Error on %s: %s", r.Request.URL, err)
})
`
diff --git a/colly.go b/colly.go
index 587174ba3..ae74b7c3e 100644
--- a/colly.go
+++ b/colly.go
@@ -24,7 +24,6 @@ import (
"fmt"
"hash/fnv"
"io"
- "io/ioutil"
"log"
"net/http"
"net/http/cookiejar"
@@ -38,22 +37,26 @@ import (
"sync/atomic"
"time"
- "google.golang.org/appengine/urlfetch"
-
"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xmlquery"
+ "github.com/gocolly/colly/v2/debug"
+ "github.com/gocolly/colly/v2/storage"
"github.com/kennygrant/sanitize"
+ whatwgUrl "github.com/nlnwa/whatwg-url/url"
"github.com/temoto/robotstxt"
-
- "github.com/gocolly/colly/debug"
- "github.com/gocolly/colly/storage"
+ "google.golang.org/appengine/urlfetch"
)
+// A CollectorOption sets an option on a Collector.
+type CollectorOption func(*Collector)
+
// Collector provides the scraper instance for a scraping job
type Collector struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
+ // Custom headers for the request
+ Headers *http.Header
// MaxDepth limits the recursion depth of visited URLs.
// Set it to 0 for infinite recursion (default).
MaxDepth int
@@ -102,28 +105,43 @@ type Collector struct {
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// RedirectHandler allows control on how a redirect will be managed
- RedirectHandler func(req *http.Request, via []*http.Request) error
+ // use c.SetRedirectHandler to set this value
+ redirectHandler func(req *http.Request, via []*http.Request) error
// CheckHead performs a HEAD request before every GET to pre-validate the response
- CheckHead bool
- store storage.Storage
- debugger debug.Debugger
- robotsMap map[string]*robotstxt.RobotsData
- htmlCallbacks []*htmlCallbackContainer
- xmlCallbacks []*xmlCallbackContainer
- requestCallbacks []RequestCallback
- responseCallbacks []ResponseCallback
- errorCallbacks []ErrorCallback
- scrapedCallbacks []ScrapedCallback
- requestCount uint32
- responseCount uint32
- backend *httpBackend
- wg *sync.WaitGroup
- lock *sync.RWMutex
+ CheckHead bool
+ // TraceHTTP enables capturing and reporting request performance for crawler tuning.
+ // When set to true, the Response.Trace will be filled in with an HTTPTrace object.
+ TraceHTTP bool
+ // Context is the context that will be used for HTTP requests. You can set this
+ // to support clean cancellation of scraping.
+ Context context.Context
+ // MaxRequests limits the number of requests done by the instance.
+ // Set it to 0 for infinite requests (default).
+ MaxRequests uint32
+
+ store storage.Storage
+ debugger debug.Debugger
+ robotsMap map[string]*robotstxt.RobotsData
+ htmlCallbacks []*htmlCallbackContainer
+ xmlCallbacks []*xmlCallbackContainer
+ requestCallbacks []RequestCallback
+ responseCallbacks []ResponseCallback
+ responseHeadersCallbacks []ResponseHeadersCallback
+ errorCallbacks []ErrorCallback
+ scrapedCallbacks []ScrapedCallback
+ requestCount uint32
+ responseCount uint32
+ backend *httpBackend
+ wg *sync.WaitGroup
+ lock *sync.RWMutex
}
// RequestCallback is a type alias for OnRequest callback functions
type RequestCallback func(*Request)
+// ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions
+type ResponseHeadersCallback func(*Response)
+
// ResponseCallback is a type alias for OnResponse callback functions
type ResponseCallback func(*Response)
@@ -142,6 +160,26 @@ type ScrapedCallback func(*Response)
// ProxyFunc is a type alias for proxy setter functions.
type ProxyFunc func(*http.Request) (*url.URL, error)
+// AlreadyVisitedError is the error type for already visited URLs.
+//
+// It's returned synchronously by Visit when the URL passed to Visit
+// has already been visited.
+//
+// When an already visited URL is encountered after following
+// redirects, this error appears in the OnError callback, and if Async
+// mode is not enabled, it is also returned by Visit.
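+//
+// A minimal sketch of detecting it (the URL is illustrative):
+//
+//	var ave *colly.AlreadyVisitedError
+//	if errors.As(c.Visit("http://example.com/"), &ave) {
+//		log.Println("already visited:", ave.Destination)
+//	}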
+type AlreadyVisitedError struct {
+ // Destination is the URL that was attempted to be visited.
+ // It might not match the URL passed to Visit if redirect
+ // was followed.
+ Destination *url.URL
+}
+
+// Error implements error interface.
+func (e *AlreadyVisitedError) Error() string {
+ return fmt.Sprintf("%q already visited", e.Destination)
+}
+
type htmlCallbackContainer struct {
Selector string
Function HTMLCallback
@@ -181,14 +219,22 @@ var (
// ErrNoURLFiltersMatch is the error thrown if visiting
// a URL which is not allowed by URLFilters
ErrNoURLFiltersMatch = errors.New("No URLFilters match")
- // ErrAlreadyVisited is the error type for already visited URLs
- ErrAlreadyVisited = errors.New("URL already visited")
// ErrRobotsTxtBlocked is the error type for robots.txt errors
ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
// ErrNoCookieJar is the error type for missing cookie jar
ErrNoCookieJar = errors.New("Cookie jar is not available")
// ErrNoPattern is the error type for LimitRules without patterns
ErrNoPattern = errors.New("No pattern defined in LimitRule")
+ // ErrEmptyProxyURL is the error type for empty Proxy URL list
+ ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
+ // ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
+ ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
+ // ErrQueueFull is the error returned when the queue is full
+ ErrQueueFull = errors.New("Queue MaxSize reached")
+ // ErrMaxRequests is the error returned when exceeding max requests
+ ErrMaxRequests = errors.New("Max Requests limit reached")
+ // ErrRetryBodyUnseekable is the error returned when a retry is
+ // attempted with a non-seekable request body
+ ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable")
)
var envMap = map[string]func(*Collector, string){
@@ -212,7 +258,7 @@ var envMap = map[string]func(*Collector, string){
},
"FOLLOW_REDIRECTS": func(c *Collector, val string) {
if !isYesString(val) {
- c.RedirectHandler = func(req *http.Request, via []*http.Request) error {
+ c.redirectHandler = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
}
@@ -225,20 +271,31 @@ var envMap = map[string]func(*Collector, string){
},
"MAX_DEPTH": func(c *Collector, val string) {
maxDepth, err := strconv.Atoi(val)
- if err != nil {
+ if err == nil {
c.MaxDepth = maxDepth
}
},
+ "MAX_REQUESTS": func(c *Collector, val string) {
+ maxRequests, err := strconv.ParseUint(val, 0, 32)
+ if err == nil {
+ c.MaxRequests = uint32(maxRequests)
+ }
+ },
"PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) {
c.ParseHTTPErrorResponse = isYesString(val)
},
+ "TRACE_HTTP": func(c *Collector, val string) {
+ c.TraceHTTP = isYesString(val)
+ },
"USER_AGENT": func(c *Collector, val string) {
c.UserAgent = val
},
}
+var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
+
// NewCollector creates a new Collector instance with default configuration
-func NewCollector(options ...func(*Collector)) *Collector {
+func NewCollector(options ...CollectorOption) *Collector {
c := &Collector{}
c.Init()
@@ -252,35 +309,54 @@ func NewCollector(options ...func(*Collector)) *Collector {
}
// UserAgent sets the user agent used by the Collector.
-func UserAgent(ua string) func(*Collector) {
+func UserAgent(ua string) CollectorOption {
return func(c *Collector) {
c.UserAgent = ua
}
}
+// Headers sets the custom headers used by the Collector.
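+// A minimal sketch (header names and values are illustrative):
+//
+//	c := colly.NewCollector(colly.Headers(map[string]string{
+//		"Accept-Language": "en-US",
+//	}))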
+func Headers(headers map[string]string) CollectorOption {
+ return func(c *Collector) {
+ customHeaders := make(http.Header)
+ for header, value := range headers {
+ customHeaders.Add(header, value)
+ }
+ c.Headers = &customHeaders
+ }
+}
+
// MaxDepth limits the recursion depth of visited URLs.
-func MaxDepth(depth int) func(*Collector) {
+func MaxDepth(depth int) CollectorOption {
return func(c *Collector) {
c.MaxDepth = depth
}
}
+// MaxRequests limits the number of requests done by the instance.
+// Set it to 0 for infinite requests (default).
+func MaxRequests(max uint32) CollectorOption {
+ return func(c *Collector) {
+ c.MaxRequests = max
+ }
+}
+
// AllowedDomains sets the domain whitelist used by the Collector.
-func AllowedDomains(domains ...string) func(*Collector) {
+func AllowedDomains(domains ...string) CollectorOption {
return func(c *Collector) {
c.AllowedDomains = domains
}
}
// ParseHTTPErrorResponse allows parsing responses with HTTP errors
-func ParseHTTPErrorResponse() func(*Collector) {
+func ParseHTTPErrorResponse() CollectorOption {
return func(c *Collector) {
c.ParseHTTPErrorResponse = true
}
}
// DisallowedDomains sets the domain blacklist used by the Collector.
-func DisallowedDomains(domains ...string) func(*Collector) {
+func DisallowedDomains(domains ...string) CollectorOption {
return func(c *Collector) {
c.DisallowedDomains = domains
}
@@ -288,7 +364,7 @@ func DisallowedDomains(domains ...string) func(*Collector) {
// DisallowedURLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the request will be stopped.
-func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) {
+func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption {
return func(c *Collector) {
c.DisallowedURLFilters = filters
}
@@ -296,28 +372,28 @@ func DisallowedURLFilters(filters ...*regexp.Regexp) func(*Collector) {
// URLFilters sets the list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the request won't be stopped.
-func URLFilters(filters ...*regexp.Regexp) func(*Collector) {
+func URLFilters(filters ...*regexp.Regexp) CollectorOption {
return func(c *Collector) {
c.URLFilters = filters
}
}
// AllowURLRevisit instructs the Collector to allow multiple downloads of the same URL
-func AllowURLRevisit() func(*Collector) {
+func AllowURLRevisit() CollectorOption {
return func(c *Collector) {
c.AllowURLRevisit = true
}
}
// MaxBodySize sets the limit of the retrieved response body in bytes.
-func MaxBodySize(sizeInBytes int) func(*Collector) {
+func MaxBodySize(sizeInBytes int) CollectorOption {
return func(c *Collector) {
c.MaxBodySize = sizeInBytes
}
}
// CacheDir specifies the location where GET requests are cached as files.
-func CacheDir(path string) func(*Collector) {
+func CacheDir(path string) CollectorOption {
return func(c *Collector) {
c.CacheDir = path
}
@@ -325,47 +401,76 @@ func CacheDir(path string) func(*Collector) {
// IgnoreRobotsTxt instructs the Collector to ignore any restrictions
// set by the target host's robots.txt file.
-func IgnoreRobotsTxt() func(*Collector) {
+func IgnoreRobotsTxt() CollectorOption {
return func(c *Collector) {
c.IgnoreRobotsTxt = true
}
}
+// TraceHTTP instructs the Collector to collect and report request trace data
+// on the Response.Trace.
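+//
+// A minimal sketch of reading the collected trace:
+//
+//	c := colly.NewCollector(colly.TraceHTTP())
+//	c.OnResponse(func(r *colly.Response) {
+//		log.Printf("trace: %+v", *r.Trace)
+//	})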
+func TraceHTTP() CollectorOption {
+ return func(c *Collector) {
+ c.TraceHTTP = true
+ }
+}
+
+// StdlibContext sets the context that will be used for HTTP requests.
+// You can set this to support clean cancellation of scraping.
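+//
+// A minimal sketch of cancellable scraping:
+//
+//	ctx, cancel := context.WithCancel(context.Background())
+//	c := colly.NewCollector(colly.StdlibContext(ctx))
+//	// calling cancel() aborts requests that are still in flight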
+func StdlibContext(ctx context.Context) CollectorOption {
+ return func(c *Collector) {
+ c.Context = ctx
+ }
+}
+
// ID sets the unique identifier of the Collector.
-func ID(id uint32) func(*Collector) {
+func ID(id uint32) CollectorOption {
return func(c *Collector) {
c.ID = id
}
}
// Async turns on asynchronous network requests.
-func Async(a bool) func(*Collector) {
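+// The boolean argument is optional and defaults to true; it is accepted
+// only for backwards compatibility with the former Async(bool) signature.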
+func Async(a ...bool) CollectorOption {
return func(c *Collector) {
- c.Async = a
+ if len(a) > 0 {
+ c.Async = a[0]
+ } else {
+ c.Async = true
+ }
}
}
// DetectCharset enables character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
-func DetectCharset() func(*Collector) {
+func DetectCharset() CollectorOption {
return func(c *Collector) {
c.DetectCharset = true
}
}
// Debugger sets the debugger used by the Collector.
-func Debugger(d debug.Debugger) func(*Collector) {
+func Debugger(d debug.Debugger) CollectorOption {
return func(c *Collector) {
d.Init()
c.debugger = d
}
}
+// CheckHead performs a HEAD request before every GET to pre-validate the response
+func CheckHead() CollectorOption {
+ return func(c *Collector) {
+ c.CheckHead = true
+ }
+}
+
// Init initializes the Collector's private variables and sets default
// configuration for the Collector
func (c *Collector) Init() {
- c.UserAgent = "colly - https://github.com/gocolly/colly"
+ c.UserAgent = "colly - https://github.com/gocolly/colly/v2"
+ c.Headers = nil
c.MaxDepth = 0
+ c.MaxRequests = 0
c.store = &storage.InMemoryStorage{}
c.store.Init()
c.MaxBodySize = 10 * 1024 * 1024
@@ -378,19 +483,22 @@ func (c *Collector) Init() {
c.robotsMap = make(map[string]*robotstxt.RobotsData)
c.IgnoreRobotsTxt = true
c.ID = atomic.AddUint32(&collectorCounter, 1)
+ c.TraceHTTP = false
+ c.Context = context.Background()
}
// Appengine will replace the Collector's backend http.Client
// With an Http.Client that is provided by appengine/urlfetch
// This function should be used when the scraper is run on
// Google App Engine. Example:
-// func startScraper(w http.ResponseWriter, r *http.Request) {
-// ctx := appengine.NewContext(r)
-// c := colly.NewCollector()
-// c.Appengine(ctx)
-// ...
-// c.Visit("https://google.ca")
-// }
+//
+// func startScraper(w http.ResponseWriter, r *http.Request) {
+// ctx := appengine.NewContext(r)
+// c := colly.NewCollector()
+// c.Appengine(ctx)
+// ...
+// c.Visit("https://google.ca")
+// }
func (c *Collector) Appengine(ctx context.Context) {
client := urlfetch.Client(ctx)
client.Jar = c.backend.Client.Jar
@@ -412,6 +520,17 @@ func (c *Collector) Visit(URL string) error {
return c.scrape(URL, "GET", 1, nil, nil, nil, true)
}
+// HasVisited checks if the provided URL has been visited
+func (c *Collector) HasVisited(URL string) (bool, error) {
+ return c.checkHasVisited(URL, nil)
+}
+
+// HasPosted checks if the provided URL and requestData combination has been visited
+// It is useful for preventing re-visits of the same URL with the same POST body
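+//
+// A minimal sketch (URL and form data are illustrative):
+//
+//	posted, err := c.HasPosted("http://example.com/login",
+//		map[string]string{"username": "admin"})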
+func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) {
+ return c.checkHasVisited(URL, requestData)
+}
+
// Head starts a collector job by creating a HEAD request.
func (c *Collector) Head(URL string) error {
return c.scrape(URL, "HEAD", 1, nil, nil, nil, false)
@@ -481,6 +600,7 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
return &Request{
Method: req.Method,
URL: u,
+ Depth: req.Depth,
Body: bytes.NewReader(req.Body),
Ctx: ctx,
ID: atomic.AddUint32(&c.requestCount, 1),
@@ -490,48 +610,50 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
}
func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
- if err := c.requestCheck(u, method, depth, checkRevisit); err != nil {
+ parsedWhatwgURL, err := urlParser.Parse(u)
+ if err != nil {
return err
}
- parsedURL, err := url.Parse(u)
+ parsedURL, err := url.Parse(parsedWhatwgURL.Href(false))
if err != nil {
return err
}
- if parsedURL.Scheme == "" {
- parsedURL.Scheme = "http"
+ if hdr == nil {
+ hdr = http.Header{}
+ if c.Headers != nil {
+ for k, v := range *c.Headers {
+ for _, value := range v {
+ hdr.Add(k, value)
+ }
+ }
+ }
}
- if !c.isDomainAllowed(parsedURL.Host) {
- return ErrForbiddenDomain
+ if _, ok := hdr["User-Agent"]; !ok {
+ hdr.Set("User-Agent", c.UserAgent)
}
- if method != "HEAD" && !c.IgnoreRobotsTxt {
- if err = c.checkRobots(parsedURL); err != nil {
+ if seeker, ok := requestData.(io.ReadSeeker); ok {
+ _, err := seeker.Seek(0, io.SeekStart)
+ if err != nil {
return err
}
}
- if hdr == nil {
- hdr = http.Header{"User-Agent": []string{c.UserAgent}}
- }
- rc, ok := requestData.(io.ReadCloser)
- if !ok && requestData != nil {
- rc = ioutil.NopCloser(requestData)
+
+ req, err := http.NewRequest(method, parsedURL.String(), requestData)
+ if err != nil {
+ return err
}
+ req.Header = hdr
// The Go HTTP API ignores "Host" in the headers, preferring the client
// to use the Host field on Request.
- host := parsedURL.Host
if hostHeader := hdr.Get("Host"); hostHeader != "" {
- host = hostHeader
- }
- req := &http.Request{
- Method: method,
- URL: parsedURL,
- Proto: "HTTP/1.1",
- ProtoMajor: 1,
- ProtoMinor: 1,
- Header: hdr,
- Body: rc,
- Host: host,
- }
- setRequestBody(req, requestData)
+ req.Host = hostHeader
+ }
+ // note: once 1.13 is minimum supported Go version,
+ // replace this with http.NewRequestWithContext
+ req = req.WithContext(c.Context)
+ if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
+ return err
+ }
u = parsedURL.String()
c.wg.Add(1)
if c.Async {
@@ -541,38 +663,6 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}
-func setRequestBody(req *http.Request, body io.Reader) {
- if body != nil {
- switch v := body.(type) {
- case *bytes.Buffer:
- req.ContentLength = int64(v.Len())
- buf := v.Bytes()
- req.GetBody = func() (io.ReadCloser, error) {
- r := bytes.NewReader(buf)
- return ioutil.NopCloser(r), nil
- }
- case *bytes.Reader:
- req.ContentLength = int64(v.Len())
- snapshot := *v
- req.GetBody = func() (io.ReadCloser, error) {
- r := snapshot
- return ioutil.NopCloser(&r), nil
- }
- case *strings.Reader:
- req.ContentLength = int64(v.Len())
- snapshot := *v
- req.GetBody = func() (io.ReadCloser, error) {
- r := snapshot
- return ioutil.NopCloser(&r), nil
- }
- }
- if req.GetBody != nil && req.ContentLength == 0 {
- req.Body = http.NoBody
- req.GetBody = func() (io.ReadCloser, error) { return http.NoBody, nil }
- }
- }
-}
-
func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
defer c.wg.Done()
if ctx == nil {
@@ -581,6 +671,7 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct
request := &Request{
URL: req.URL,
Headers: &req.Header,
+ Host: req.Host,
Ctx: ctx,
Depth: depth,
Method: method,
@@ -589,6 +680,10 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct
ID: atomic.AddUint32(&c.requestCount, 1),
}
+ if req.Header.Get("Accept") == "" {
+ req.Header.Set("Accept", "*/*")
+ }
+
c.handleOnRequest(request)
if request.abort {
@@ -599,25 +694,31 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
}
- if req.Header.Get("Accept") == "" {
- req.Header.Set("Accept", "*/*")
+ var hTrace *HTTPTrace
+ if c.TraceHTTP {
+ hTrace = &HTTPTrace{}
+ req = hTrace.WithTrace(req)
}
-
origURL := req.URL
- response, err := c.backend.Cache(req, c.MaxBodySize, c.CacheDir)
+ checkHeadersFunc := func(req *http.Request, statusCode int, headers http.Header) bool {
+ if req.URL != origURL {
+ request.URL = req.URL
+ request.Headers = &req.Header
+ }
+ c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
+ return !request.abort
+ }
+ response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir)
if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
request.ProxyURL = proxyURL
}
if err := c.handleOnError(response, err, request, ctx); err != nil {
return err
}
- if req.URL != origURL {
- request.URL = req.URL
- request.Headers = &req.Header
- }
atomic.AddUint32(&c.responseCount, 1)
response.Ctx = ctx
response.Request = request
+ response.Trace = hTrace
err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
if err != nil {
@@ -641,39 +742,69 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct
return err
}
-func (c *Collector) requestCheck(u, method string, depth int, checkRevisit bool) error {
- if u == "" {
- return ErrMissingURL
- }
+func (c *Collector) requestCheck(parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int, checkRevisit bool) error {
+ u := parsedURL.String()
if c.MaxDepth > 0 && c.MaxDepth < depth {
return ErrMaxDepth
}
- if len(c.DisallowedURLFilters) > 0 {
- if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) {
- return ErrForbiddenURL
- }
+ if c.MaxRequests > 0 && c.requestCount >= c.MaxRequests {
+ return ErrMaxRequests
}
- if len(c.URLFilters) > 0 {
- if !isMatchingFilter(c.URLFilters, []byte(u)) {
- return ErrNoURLFiltersMatch
+ if err := c.checkFilters(u, parsedURL.Hostname()); err != nil {
+ return err
+ }
+ if method != "HEAD" && !c.IgnoreRobotsTxt {
+ if err := c.checkRobots(parsedURL); err != nil {
+ return err
}
}
- if checkRevisit && !c.AllowURLRevisit && method == "GET" {
- h := fnv.New64a()
- h.Write([]byte(u))
- uHash := h.Sum64()
+ if checkRevisit && !c.AllowURLRevisit {
+ // TODO: weird behaviour; it allows CheckHead to work correctly,
+ // but it would probably be better solved with a
+ // "check-but-not-save" flag or something similar
+ if method != "GET" && getBody == nil {
+ return nil
+ }
+
+ var body io.ReadCloser
+ if getBody != nil {
+ var err error
+ body, err = getBody()
+ if err != nil {
+ return err
+ }
+ defer body.Close()
+ }
+ uHash := requestHash(u, body)
visited, err := c.store.IsVisited(uHash)
if err != nil {
return err
}
if visited {
- return ErrAlreadyVisited
+ return &AlreadyVisitedError{parsedURL}
}
return c.store.Visited(uHash)
}
return nil
}
+func (c *Collector) checkFilters(URL, domain string) error {
+ if len(c.DisallowedURLFilters) > 0 {
+ if isMatchingFilter(c.DisallowedURLFilters, []byte(URL)) {
+ return ErrForbiddenURL
+ }
+ }
+ if len(c.URLFilters) > 0 {
+ if !isMatchingFilter(c.URLFilters, []byte(URL)) {
+ return ErrNoURLFiltersMatch
+ }
+ }
+ if !c.isDomainAllowed(domain) {
+ return ErrForbiddenDomain
+ }
+ return nil
+}
+
func (c *Collector) isDomainAllowed(domain string) bool {
for _, d2 := range c.DisallowedDomains {
if d2 == domain {
@@ -702,6 +833,8 @@ func (c *Collector) checkRobots(u *url.URL) error {
if err != nil {
return err
}
+ defer resp.Body.Close()
+
robot, err = robotstxt.FromResponse(resp)
if err != nil {
return err
@@ -716,7 +849,11 @@ func (c *Collector) checkRobots(u *url.URL) error {
return nil
}
- if !uaGroup.Test(u.EscapedPath()) {
+ eu := u.EscapedPath()
+ if u.RawQuery != "" {
+ eu += "?" + u.Query().Encode()
+ }
+ if !uaGroup.Test(eu) {
return ErrRobotsTxtBlocked
}
return nil
@@ -727,8 +864,8 @@ func (c *Collector) checkRobots(u *url.URL) error {
func (c *Collector) String() string {
return fmt.Sprintf(
"Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d",
- c.requestCount,
- c.responseCount,
+ atomic.LoadUint32(&c.requestCount),
+ atomic.LoadUint32(&c.responseCount),
len(c.requestCallbacks),
len(c.htmlCallbacks),
len(c.responseCallbacks),
@@ -752,6 +889,23 @@ func (c *Collector) OnRequest(f RequestCallback) {
c.lock.Unlock()
}
+// OnResponseHeaders registers a function. Function will be executed on every response
+// when headers and status are already received, but body is not yet read.
+//
+// Like in OnRequest, you can call Request.Abort to abort the transfer. This might be
+// useful if, for example, you're following all hyperlinks, but want to avoid
+// downloading files.
+//
+// Be aware that using this will prevent HTTP/1.1 connection reuse, as
+// the only way to abort a download is to immediately close the connection.
+// HTTP/2 doesn't suffer from this problem, as it's possible to close
+// a specific stream inside the connection.
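+//
+// A minimal sketch that aborts non-HTML downloads (assuming the server
+// sets a Content-Type header):
+//
+//	c.OnResponseHeaders(func(r *colly.Response) {
+//		if !strings.Contains(r.Headers.Get("Content-Type"), "html") {
+//			r.Request.Abort()
+//		}
+//	})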
+func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
+ c.lock.Lock()
+ c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
+ c.lock.Unlock()
+}
+
// OnResponse registers a function. Function will be executed on every response
func (c *Collector) OnResponse(f ResponseCallback) {
c.lock.Lock()
@@ -846,6 +1000,11 @@ func (c *Collector) OnScraped(f ScrapedCallback) {
c.lock.Unlock()
}
+// SetClient will override the previously set http.Client
+func (c *Collector) SetClient(client *http.Client) {
+ c.backend.Client = client
+}
+
// WithTransport allows you to set a custom http.RoundTripper (transport)
func (c *Collector) WithTransport(transport http.RoundTripper) {
c.backend.Client.Transport = transport
@@ -857,7 +1016,7 @@ func (c *Collector) DisableCookies() {
}
// SetCookieJar overrides the previously set cookie jar
-func (c *Collector) SetCookieJar(j *cookiejar.Jar) {
+func (c *Collector) SetCookieJar(j http.CookieJar) {
c.backend.Client.Jar = j
}
@@ -904,9 +1063,11 @@ func (c *Collector) SetProxyFunc(p ProxyFunc) {
t, ok := c.backend.Client.Transport.(*http.Transport)
if c.backend.Client.Transport != nil && ok {
t.Proxy = p
+ t.DisableKeepAlives = true
} else {
c.backend.Client.Transport = &http.Transport{
- Proxy: p,
+ Proxy: p,
+ DisableKeepAlives: true,
}
}
}
@@ -943,16 +1104,53 @@ func (c *Collector) handleOnResponse(r *Response) {
}
}
+func (c *Collector) handleOnResponseHeaders(r *Response) {
+ if c.debugger != nil {
+ c.debugger.Event(createEvent("responseHeaders", r.Request.ID, c.ID, map[string]string{
+ "url": r.Request.URL.String(),
+ "status": http.StatusText(r.StatusCode),
+ }))
+ }
+ for _, f := range c.responseHeadersCallbacks {
+ f(r)
+ }
+}
+
func (c *Collector) handleOnHTML(resp *Response) error {
- if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
+ if len(c.htmlCallbacks) == 0 {
+ return nil
+ }
+
+ contentType := resp.Headers.Get("Content-Type")
+ if contentType == "" {
+ contentType = http.DetectContentType(resp.Body)
+ }
+ // implementation of mime.ParseMediaType without parsing the params
+ // part
+ mediatype, _, _ := strings.Cut(contentType, ";")
+ mediatype = strings.TrimSpace(strings.ToLower(mediatype))
+
+ // TODO we also want to parse application/xml as XHTML if it has
+ // appropriate doctype
+ switch mediatype {
+ case "text/html", "application/xhtml+xml":
+ default:
return nil
}
+
doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
if err != nil {
return err
}
if href, found := doc.Find("base[href]").Attr("href"); found {
- resp.Request.baseURL, _ = url.Parse(href)
+ u, err := urlParser.ParseRef(resp.Request.URL.String(), href)
+ if err == nil {
+ baseURL, err := url.Parse(u.Href(false))
+ if err == nil {
+ resp.Request.baseURL = baseURL
+ }
+ }
+
}
for _, cc := range c.htmlCallbacks {
i := 0
@@ -978,7 +1176,8 @@ func (c *Collector) handleOnXML(resp *Response) error {
return nil
}
contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
- if !strings.Contains(contentType, "html") && !strings.Contains(contentType, "xml") {
+ isXMLFile := strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml") || strings.HasSuffix(strings.ToLower(resp.Request.URL.Path), ".xml.gz")
+ if !strings.Contains(contentType, "html") && (!strings.Contains(contentType, "xml") && !isXMLFile) {
return nil
}
@@ -990,7 +1189,10 @@ func (c *Collector) handleOnXML(resp *Response) error {
if e := htmlquery.FindOne(doc, "//base"); e != nil {
for _, a := range e.Attr {
if a.Key == "href" {
- resp.Request.baseURL, _ = url.Parse(a.Val)
+ baseURL, err := resp.Request.URL.Parse(a.Val)
+ if err == nil {
+ resp.Request.baseURL = baseURL
+ }
break
}
}
@@ -1008,7 +1210,7 @@ func (c *Collector) handleOnXML(resp *Response) error {
cc.Function(e)
}
}
- } else if strings.Contains(contentType, "xml") {
+ } else if strings.Contains(contentType, "xml") || isXMLFile {
doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
if err != nil {
return err
@@ -1082,6 +1284,12 @@ func (c *Collector) Limits(rules []*LimitRule) error {
return c.backend.Limits(rules)
}
+// SetRedirectHandler sets the function used to decide how redirects are handled
+func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) {
+ c.redirectHandler = f
+ c.backend.Client.CheckRedirect = c.checkRedirectFunc()
+}
+
// SetCookies handles the receipt of the cookies in a reply for the given URL
func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error {
if c.backend.Client.Jar == nil {
@@ -1121,16 +1329,20 @@ func (c *Collector) Clone() *Collector {
IgnoreRobotsTxt: c.IgnoreRobotsTxt,
MaxBodySize: c.MaxBodySize,
MaxDepth: c.MaxDepth,
+ MaxRequests: c.MaxRequests,
DisallowedURLFilters: c.DisallowedURLFilters,
URLFilters: c.URLFilters,
CheckHead: c.CheckHead,
ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
UserAgent: c.UserAgent,
+ Headers: c.Headers,
+ TraceHTTP: c.TraceHTTP,
+ Context: c.Context,
store: c.store,
backend: c.backend,
debugger: c.debugger,
Async: c.Async,
- RedirectHandler: c.RedirectHandler,
+ redirectHandler: c.redirectHandler,
errorCallbacks: make([]ErrorCallback, 0, 8),
htmlCallbacks: make([]*htmlCallbackContainer, 0, 8),
xmlCallbacks: make([]*xmlCallbackContainer, 0, 8),
@@ -1145,12 +1357,41 @@ func (c *Collector) Clone() *Collector {
func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
return func(req *http.Request, via []*http.Request) error {
- if !c.isDomainAllowed(req.URL.Host) {
- return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host)
+ if err := c.checkFilters(req.URL.String(), req.URL.Hostname()); err != nil {
+ return fmt.Errorf("Not following redirect to %q: %w", req.URL, err)
+ }
+
+ // allow redirects to the original destination
+ // to support websites redirecting to the same page while setting
+ // session cookies
+ samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String())
+
+ if !c.AllowURLRevisit && !samePageRedirect {
+ var body io.ReadCloser
+ if req.GetBody != nil {
+ var err error
+ body, err = req.GetBody()
+ if err != nil {
+ return err
+ }
+ defer body.Close()
+ }
+ uHash := requestHash(req.URL.String(), body)
+ visited, err := c.store.IsVisited(uHash)
+ if err != nil {
+ return err
+ }
+ if visited {
+ return &AlreadyVisitedError{req.URL}
+ }
+ err = c.store.Visited(uHash)
+ if err != nil {
+ return err
+ }
}
- if c.RedirectHandler != nil {
- return c.RedirectHandler(req, via)
+ if c.redirectHandler != nil {
+ return c.redirectHandler(req, via)
}
// Honor Go's default maximum of 10 redirects
@@ -1160,13 +1401,6 @@ func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Requ
lastRequest := via[len(via)-1]
- // Copy the headers from last request
- for hName, hValues := range lastRequest.Header {
- for _, hValue := range hValues {
- req.Header.Set(hName, hValue)
- }
- }
-
// If domain has changed, remove the Authorization-header if it exists
if req.URL.Host != lastRequest.URL.Host {
req.Header.Del("Authorization")
@@ -1190,6 +1424,11 @@ func (c *Collector) parseSettingsFromEnv() {
}
}
+func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) {
+ hash := requestHash(URL, createFormReader(requestData))
+ return c.store.IsVisited(hash)
+}
+
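Note: checkHasVisited backs the exported HasVisited/HasPosted checks exercised in the tests below; both hash the URL (plus form data for POSTs) exactly as the visit bookkeeping does. A sketch (URL hypothetical):

    package main

    import "github.com/gocolly/colly/v2"

    func main() {
        c := colly.NewCollector()
        // only visit if the storage has no record of this URL yet
        if visited, err := c.HasVisited("http://example.com/"); err == nil && !visited {
            c.Visit("http://example.com/") // hypothetical URL
        }
    }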
// SanitizeFileName replaces dangerous characters in a string
// so the return value can be used as a safe file name.
func SanitizeFileName(fileName string) string {
@@ -1228,7 +1467,8 @@ func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
buffer.WriteString("\n")
}
buffer.WriteString(dashBoundary + "--\n\n")
- return buffer
+ return bytes.NewReader(buffer.Bytes())
}
// randomBoundary was borrowed from
@@ -1298,3 +1538,22 @@ func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
}
return false
}
+
+func normalizeURL(u string) string {
+ parsed, err := urlParser.Parse(u)
+ if err != nil {
+ return u
+ }
+ return parsed.String()
+}
+
+func requestHash(url string, body io.Reader) uint64 {
+ h := fnv.New64a()
+ // reparse the url to fix ambiguities such as
+ // "http://example.com" vs "http://example.com/"
+ io.WriteString(h, normalizeURL(url))
+ if body != nil {
+ io.Copy(h, body)
+ }
+ return h.Sum64()
+}
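Note: because requestHash normalizes first, trailing-slash spellings of the same URL collide, which the same-page-redirect check above relies on. A hypothetical in-package test sketching that invariant:

    package colly

    import "testing"

    func TestRequestHashNormalization(t *testing.T) {
        // "http://example.com" and "http://example.com/" must share a hash,
        // otherwise trivially different spellings would be revisited
        if requestHash("http://example.com", nil) != requestHash("http://example.com/", nil) {
            t.Error("expected normalized URL variants to share a hash")
        }
    }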
diff --git a/colly_test.go b/colly_test.go
index d5c88294b..e70d2774e 100644
--- a/colly_test.go
+++ b/colly_test.go
@@ -15,19 +15,24 @@
package colly
import (
+ "bufio"
"bytes"
+ "context"
+ "errors"
"fmt"
"net/http"
"net/http/httptest"
+ "net/url"
"os"
"reflect"
"regexp"
"strings"
"testing"
+ "time"
"github.com/PuerkitoBio/goquery"
- "github.com/gocolly/colly/debug"
+ "github.com/gocolly/colly/v2/debug"
)
var serverIndexResponse = []byte("hello world\n")
@@ -35,9 +40,10 @@ var robotsFile = `
User-agent: *
Allow: /allowed
Disallow: /disallowed
+Disallow: /allowed*q=
`
-func newTestServer() *httptest.Server {
+func newUnstartedTestServer() *httptest.Server {
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
@@ -46,7 +52,11 @@ func newTestServer() *httptest.Server {
})
mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
- w.Header().Set("Content-Type", "text/html")
+ if r.URL.Query().Get("no-content-type") != "" {
+ w.Header()["Content-Type"] = nil
+ } else {
+ w.Header().Set("Content-Type", "text/html")
+ }
w.Write([]byte(`
@@ -61,6 +71,17 @@ func newTestServer() *httptest.Server {
`))
})
+ mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/xml")
+ w.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
+<page>
+	<title>Test Page</title>
+	<paragraph type="description">This is a test page</paragraph>
+	<paragraph type="description">This is a test paragraph</paragraph>
+</page>
+	`))
+ })
+
mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
if r.Method == "POST" {
w.Header().Set("Content-Type", "text/html")
@@ -84,7 +105,11 @@ func newTestServer() *httptest.Server {
})
mux.Handle("/redirect", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
- http.Redirect(w, r, "/redirected/", http.StatusSeeOther)
+ destination := "/redirected/"
+ if d := r.URL.Query().Get("d"); d != "" {
+ destination = d
+ }
+ http.Redirect(w, r, destination, http.StatusSeeOther)
}))
@@ -121,6 +146,21 @@ func newTestServer() *httptest.Server {
w.Write([]byte(r.Header.Get("User-Agent")))
})
+ mux.HandleFunc("/host_header", func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(r.Host))
+ })
+
+ mux.HandleFunc("/accept_header", func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(r.Header.Get("Accept")))
+ })
+
+ mux.HandleFunc("/custom_header", func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+ w.Write([]byte(r.Header.Get("Test")))
+ })
+
mux.HandleFunc("/base", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`
@@ -136,7 +176,99 @@ func newTestServer() *httptest.Server {
`))
})
- return httptest.NewServer(mux)
+ mux.HandleFunc("/base_relative", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html")
+ w.Write([]byte(`<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page</title>
+<base href="/foobar/" />
+</head>
+<body>
+<a href="z">link</a>
+</body>
+</html>
+	`))
+ })
+
+ mux.HandleFunc("/tabs_and_newlines", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html")
+ w.Write([]byte(`<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page</title>
+<base href="/foo	bar/" />
+</head>
+<body>
+<a href="x
+y">link</a>
+</body>
+</html>
+	`))
+ })
+
+ mux.HandleFunc("/foobar/xy", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html")
+ w.Write([]byte(`<!DOCTYPE html>
+<html>
+<head>
+<title>Test Page</title>
+</head>
+<body>
+<p>hello</p>
+</body>
+</html>
+	`))
+ })
+
+ mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) {
+ w.Write([]byte("100 percent"))
+ })
+
+ mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "application/octet-stream")
+ ww := bufio.NewWriter(w)
+ defer ww.Flush()
+ for {
+ // have to check error to detect client aborting download
+ if _, err := ww.Write([]byte{0x41}); err != nil {
+ return
+ }
+ }
+ })
+
+ mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(200)
+
+ ticker := time.NewTicker(100 * time.Millisecond)
+ defer ticker.Stop()
+
+ i := 0
+
+ for {
+ select {
+ case <-r.Context().Done():
+ return
+ case t := <-ticker.C:
+ fmt.Fprintf(w, "%s\n", t)
+ if flusher, ok := w.(http.Flusher); ok {
+ flusher.Flush()
+ }
+ i++
+ if i == 10 {
+ return
+ }
+ }
+ }
+ })
+
+ return httptest.NewUnstartedServer(mux)
+}
+
+func newTestServer() *httptest.Server {
+ srv := newUnstartedTestServer()
+ srv.Start()
+ return srv
}
var newCollectorTests = map[string]func(*testing.T){
@@ -285,6 +417,53 @@ var newCollectorTests = map[string]func(*testing.T){
t.Fatalf("c.debugger = %v, want %v", got, want)
}
},
+ "CheckHead": func(t *testing.T) {
+ c := NewCollector(CheckHead())
+
+ if !c.CheckHead {
+ t.Fatal("c.CheckHead = false, want true")
+ }
+ },
+ "Async": func(t *testing.T) {
+ c := NewCollector(Async())
+
+ if !c.Async {
+ t.Fatal("c.Async = false, want true")
+ }
+ },
+}
+
+func TestNoAcceptHeader(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ var receivedHeader string
+ // checks that the Accept header is sent by default
+ func() {
+ c := NewCollector()
+ c.OnResponse(func(resp *Response) {
+ receivedHeader = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/accept_header")
+ if receivedHeader != "*/*" {
+ t.Errorf("default Accept header isn't */*. got: %v", receivedHeader)
+ }
+ }()
+
+ // checks that the Accept header can be removed
+ func() {
+ c := NewCollector()
+ c.OnRequest(func(r *Request) {
+ r.Headers.Del("Accept")
+ })
+ c.OnResponse(func(resp *Response) {
+ receivedHeader = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/accept_header")
+ if receivedHeader != "" {
+ t.Errorf("failed to pass request with no Accept header. got: %v", receivedHeader)
+ }
+ }()
}
func TestNewCollector(t *testing.T) {
@@ -349,6 +528,65 @@ func TestCollectorVisit(t *testing.T) {
}
}
+func TestCollectorVisitWithAllowedDomains(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector(AllowedDomains("localhost", "127.0.0.1", "::1"))
+ err := c.Visit(ts.URL)
+ if err != nil {
+ t.Errorf("Failed to visit url %s", ts.URL)
+ }
+
+ err = c.Visit("http://example.com")
+ if err != ErrForbiddenDomain {
+ t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
+ }
+}
+
+func TestCollectorVisitWithDisallowedDomains(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector(DisallowedDomains("localhost", "127.0.0.1", "::1"))
+ err := c.Visit(ts.URL)
+ if err != ErrForbiddenDomain {
+ t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
+ }
+
+ c2 := NewCollector(DisallowedDomains("example.com"))
+ err = c2.Visit("http://example.com:8080")
+ if err != ErrForbiddenDomain {
+ t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
+ }
+ err = c2.Visit(ts.URL)
+ if err != nil {
+ t.Errorf("Failed to visit url %s", ts.URL)
+ }
+}
+
+func TestCollectorVisitResponseHeaders(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ var onResponseHeadersCalled bool
+
+ c := NewCollector()
+ c.OnResponseHeaders(func(r *Response) {
+ onResponseHeadersCalled = true
+ if r.Headers.Get("Content-Type") == "application/octet-stream" {
+ r.Request.Abort()
+ }
+ })
+ c.OnResponse(func(r *Response) {
+ t.Error("OnResponse was called")
+ })
+ c.Visit(ts.URL + "/large_binary")
+ if !onResponseHeadersCalled {
+ t.Error("OnResponseHeaders was not called")
+ }
+}
+
func TestCollectorOnHTML(t *testing.T) {
ts := newTestServer()
defer ts.Close()
@@ -393,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
}
}
+func TestCollectorContentSniffing(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+
+ htmlCallbackCalled := false
+
+ c.OnResponse(func(r *Response) {
+ if (*r.Headers)["Content-Type"] != nil {
+ t.Error("Content-Type unexpectedly not nil")
+ }
+ })
+
+ c.OnHTML("html", func(e *HTMLElement) {
+ htmlCallbackCalled = true
+ })
+
+ err := c.Visit(ts.URL + "/html?no-content-type=yes")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if !htmlCallbackCalled {
+ t.Error("OnHTML was not called")
+ }
+}
+
func TestCollectorURLRevisit(t *testing.T) {
ts := newTestServer()
defer ts.Close()
@@ -422,160 +688,560 @@ func TestCollectorURLRevisit(t *testing.T) {
}
}
-func TestCollectorPost(t *testing.T) {
+func TestCollectorPostRevisit(t *testing.T) {
ts := newTestServer()
defer ts.Close()
postValue := "hello"
- c := NewCollector()
+ postData := map[string]string{
+ "name": postValue,
+ }
+ visitCount := 0
+ c := NewCollector()
c.OnResponse(func(r *Response) {
if postValue != string(r.Body) {
t.Error("Failed to send data with POST")
}
+ visitCount++
})
+ c.Post(ts.URL+"/login", postData)
+ c.Post(ts.URL+"/login", postData)
c.Post(ts.URL+"/login", map[string]string{
- "name": postValue,
+ "name": postValue,
+ "lastname": "world",
})
-}
-
-func TestRedirect(t *testing.T) {
- ts := newTestServer()
- defer ts.Close()
- c := NewCollector()
- c.OnHTML("a[href]", func(e *HTMLElement) {
- u := e.Request.AbsoluteURL(e.Attr("href"))
- if !strings.HasSuffix(u, "/redirected/test") {
- t.Error("Invalid URL after redirect: " + u)
- }
- })
- c.OnResponse(func(r *Response) {
- if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") {
- t.Error("Invalid URL in Request after redirect: " + r.Request.URL.String())
- }
- })
- c.Visit(ts.URL + "/redirect")
-}
+ if visitCount != 2 {
+ t.Error("URL POST revisited")
+ }
-func TestBaseTag(t *testing.T) {
- ts := newTestServer()
- defer ts.Close()
+ c.AllowURLRevisit = true
- c := NewCollector()
- c.OnHTML("a[href]", func(e *HTMLElement) {
- u := e.Request.AbsoluteURL(e.Attr("href"))
- if u != "http://xy.com/z" {
- t.Error("Invalid tag handling in OnHTML: expected https://xy.com/z, got " + u)
- }
- })
- c.Visit(ts.URL + "/base")
+ c.Post(ts.URL+"/login", postData)
+ c.Post(ts.URL+"/login", postData)
- c2 := NewCollector()
- c2.OnXML("//a", func(e *XMLElement) {
- u := e.Request.AbsoluteURL(e.Attr("href"))
- if u != "http://xy.com/z" {
- t.Error("Invalid tag handling in OnXML: expected https://xy.com/z, got " + u)
- }
- })
- c2.Visit(ts.URL + "/base")
+ if visitCount != 4 {
+ t.Error("URL POST not revisited")
+ }
}
-func TestCollectorCookies(t *testing.T) {
+func TestCollectorURLRevisitCheck(t *testing.T) {
ts := newTestServer()
defer ts.Close()
c := NewCollector()
- if err := c.Visit(ts.URL + "/set_cookie"); err != nil {
- t.Fatal(err)
- }
+ visited, err := c.HasVisited(ts.URL)
- if err := c.Visit(ts.URL + "/check_cookie"); err != nil {
- t.Fatalf("Failed to use previously set cookies: %s", err)
+ if err != nil {
+ t.Error(err.Error())
}
-}
-
-func TestRobotsWhenAllowed(t *testing.T) {
- ts := newTestServer()
- defer ts.Close()
- c := NewCollector()
- c.IgnoreRobotsTxt = false
+ if visited != false {
+ t.Error("Expected URL to NOT have been visited")
+ }
- c.OnResponse(func(resp *Response) {
- if resp.StatusCode != 200 {
- t.Fatalf("Wrong response code: %d", resp.StatusCode)
- }
- })
+ c.Visit(ts.URL)
- err := c.Visit(ts.URL + "/allowed")
+ visited, err = c.HasVisited(ts.URL)
if err != nil {
- t.Fatal(err)
+ t.Error(err.Error())
}
-}
-func TestRobotsWhenDisallowed(t *testing.T) {
- ts := newTestServer()
- defer ts.Close()
+ if visited != true {
+ t.Error("Expected URL to have been visited")
+ }
- c := NewCollector()
- c.IgnoreRobotsTxt = false
+ errorTestCases := []struct {
+ Path string
+ DestinationError string
+ }{
+ {"/", "/"},
+ {"/redirect?d=/", "/"},
+ // now that /redirect?d=/ itself is recorded as visited,
+ // it becomes the destination reported in the error
+ {"/redirect?d=/", "/redirect?d=/"},
+ {"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/"},
+ {"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/redirect%3Fd%3D/"},
+ {"/redirect?d=/redirect%3Fd%3D/&foo=bar", "/redirect?d=/"},
+ }
- c.OnResponse(func(resp *Response) {
- t.Fatalf("Received response: %d", resp.StatusCode)
- })
+ for i, testCase := range errorTestCases {
+ err := c.Visit(ts.URL + testCase.Path)
+ if testCase.DestinationError == "" {
+ if err != nil {
+ t.Errorf("got unexpected error in test %d: %q", i, err)
+ }
+ } else {
+ var ave *AlreadyVisitedError
+ if !errors.As(err, &ave) {
+ t.Errorf("err=%q returned when trying to revisit, expected AlreadyVisitedError", err)
+ } else {
+ if got, want := ave.Destination.String(), ts.URL+testCase.DestinationError; got != want {
+ t.Errorf("wrong destination in AlreadyVisitedError in test %d, got=%q want=%q", i, got, want)
+ }
+ }
+ }
+ }
+}
- err := c.Visit(ts.URL + "/disallowed")
- if err.Error() != "URL blocked by robots.txt" {
- t.Fatalf("wrong error message: %v", err)
+func TestSetCookieRedirect(t *testing.T) {
+ type middleware = func(http.Handler) http.Handler
+ for _, m := range []middleware{
+ requireSessionCookieSimple,
+ requireSessionCookieAuthPage,
+ } {
+ t.Run("", func(t *testing.T) {
+ ts := newUnstartedTestServer()
+ ts.Config.Handler = m(ts.Config.Handler)
+ ts.Start()
+ defer ts.Close()
+ c := NewCollector()
+ c.OnResponse(func(r *Response) {
+ if got, want := r.Body, serverIndexResponse; !bytes.Equal(got, want) {
+ t.Errorf("bad response body got=%q want=%q", got, want)
+ }
+ if got, want := r.StatusCode, http.StatusOK; got != want {
+ t.Errorf("bad response code got=%d want=%d", got, want)
+ }
+ })
+ if err := c.Visit(ts.URL); err != nil {
+ t.Fatal(err)
+ }
+ })
}
}
-func TestIgnoreRobotsWhenDisallowed(t *testing.T) {
+func TestCollectorPostURLRevisitCheck(t *testing.T) {
ts := newTestServer()
defer ts.Close()
c := NewCollector()
- c.IgnoreRobotsTxt = true
- c.OnResponse(func(resp *Response) {
- if resp.StatusCode != 200 {
- t.Fatalf("Wrong response code: %d", resp.StatusCode)
- }
- })
+ postValue := "hello"
+ postData := map[string]string{
+ "name": postValue,
+ }
- err := c.Visit(ts.URL + "/disallowed")
+ posted, err := c.HasPosted(ts.URL+"/login", postData)
if err != nil {
- t.Fatal(err)
+ t.Error(err.Error())
}
-}
+ if posted != false {
+ t.Error("Expected URL to NOT have been visited")
+ }
-func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) {
- ts := newTestServer()
- ts.Close() // immediately close the server to force a connection error
+ c.Post(ts.URL+"/login", postData)
- c := NewCollector()
- c.IgnoreRobotsTxt = false
- err := c.Visit(ts.URL)
+ posted, err = c.HasPosted(ts.URL+"/login", postData)
- if err == nil {
- t.Fatal("Error expected")
+ if err != nil {
+ t.Error(err.Error())
}
-}
-func TestEnvSettings(t *testing.T) {
- ts := newTestServer()
- defer ts.Close()
+ if posted != true {
+ t.Error("Expected URL to have been visited")
+ }
- os.Setenv("COLLY_USER_AGENT", "test")
- defer os.Unsetenv("COLLY_USER_AGENT")
+ postData["lastname"] = "world"
+ posted, err = c.HasPosted(ts.URL+"/login", postData)
- c := NewCollector()
+ if err != nil {
+ t.Error(err.Error())
+ }
+
+ if posted != false {
+ t.Error("Expected URL to NOT have been visited")
+ }
+
+ c.Post(ts.URL+"/login", postData)
+
+ posted, err = c.HasPosted(ts.URL+"/login", postData)
+
+ if err != nil {
+ t.Error(err.Error())
+ }
+
+ if posted != true {
+ t.Error("Expected URL to have been visited")
+ }
+}
+
+// TestCollectorURLRevisitDomainDisallowed ensures that a URL on a disallowed domain is not considered visited.
+func TestCollectorURLRevisitDomainDisallowed(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ parsedURL, err := url.Parse(ts.URL)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ c := NewCollector(DisallowedDomains(parsedURL.Hostname()))
+ err = c.Visit(ts.URL)
+ if got, want := err, ErrForbiddenDomain; got != want {
+ t.Fatalf("wrong error on first visit: got=%v want=%v", got, want)
+ }
+ err = c.Visit(ts.URL)
+ if got, want := err, ErrForbiddenDomain; got != want {
+ t.Fatalf("wrong error on second visit: got=%v want=%v", got, want)
+ }
+}
+
+func TestCollectorPost(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ postValue := "hello"
+ c := NewCollector()
+
+ c.OnResponse(func(r *Response) {
+ if postValue != string(r.Body) {
+ t.Error("Failed to send data with POST")
+ }
+ })
+
+ c.Post(ts.URL+"/login", map[string]string{
+ "name": postValue,
+ })
+}
+
+func TestCollectorPostRaw(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ postValue := "hello"
+ c := NewCollector()
+
+ c.OnResponse(func(r *Response) {
+ if postValue != string(r.Body) {
+ t.Error("Failed to send data with POST")
+ }
+ })
+
+ c.PostRaw(ts.URL+"/login", []byte("name="+postValue))
+}
+
+func TestCollectorPostRawRevisit(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ postValue := "hello"
+ postData := "name=" + postValue
+ visitCount := 0
+
+ c := NewCollector()
+ c.OnResponse(func(r *Response) {
+ if postValue != string(r.Body) {
+ t.Error("Failed to send data with POST RAW")
+ }
+ visitCount++
+ })
+
+ c.PostRaw(ts.URL+"/login", []byte(postData))
+ c.PostRaw(ts.URL+"/login", []byte(postData))
+ c.PostRaw(ts.URL+"/login", []byte(postData+"&lastname=world"))
+
+ if visitCount != 2 {
+ t.Error("URL POST RAW revisited")
+ }
+
+ c.AllowURLRevisit = true
+
+ c.PostRaw(ts.URL+"/login", []byte(postData))
+ c.PostRaw(ts.URL+"/login", []byte(postData))
+
+ if visitCount != 4 {
+ t.Error("URL POST RAW not revisited")
+ }
+}
+
+func TestRedirect(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.OnHTML("a[href]", func(e *HTMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ if !strings.HasSuffix(u, "/redirected/test") {
+ t.Error("Invalid URL after redirect: " + u)
+ }
+ })
+
+ c.OnResponseHeaders(func(r *Response) {
+ if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") {
+ t.Error("Invalid URL in Request after redirect (OnResponseHeaders): " + r.Request.URL.String())
+ }
+ })
+
+ c.OnResponse(func(r *Response) {
+ if !strings.HasSuffix(r.Request.URL.String(), "/redirected/") {
+ t.Error("Invalid URL in Request after redirect (OnResponse): " + r.Request.URL.String())
+ }
+ })
+ c.Visit(ts.URL + "/redirect")
+}
+
+func TestIssue594(t *testing.T) {
+ // This is a regression test for a data race bug. There are no
+ // assertions because it's meant to be run under the race detector
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ // if a timeout is set, the bug is not triggered
+ c.SetClient(&http.Client{Timeout: 0 * time.Second})
+
+ c.Visit(ts.URL)
+}
+
+func TestRedirectWithDisallowedURLs(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.DisallowedURLFilters = []*regexp.Regexp{regexp.MustCompile(ts.URL + "/redirected/test")}
+ c.OnHTML("a[href]", func(e *HTMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ err := c.Visit(u)
+ if !errors.Is(err, ErrForbiddenURL) {
+ t.Error("URL should have been forbidden: " + u)
+ }
+ })
+
+ c.Visit(ts.URL + "/redirect")
+}
+
+func TestBaseTag(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.OnHTML("a[href]", func(e *HTMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ if u != "http://xy.com/z" {
+ t.Error("Invalid tag handling in OnHTML: expected https://xy.com/z, got " + u)
+ }
+ })
+ c.Visit(ts.URL + "/base")
+
+ c2 := NewCollector()
+ c2.OnXML("//a", func(e *XMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ if u != "http://xy.com/z" {
+ t.Error("Invalid tag handling in OnXML: expected https://xy.com/z, got " + u)
+ }
+ })
+ c2.Visit(ts.URL + "/base")
+}
+
+func TestBaseTagRelative(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.OnHTML("a[href]", func(e *HTMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ expected := ts.URL + "/foobar/z"
+ if u != expected {
+ t.Errorf("Invalid tag handling in OnHTML: expected %q, got %q", expected, u)
+ }
+ })
+ c.Visit(ts.URL + "/base_relative")
+
+ c2 := NewCollector()
+ c2.OnXML("//a", func(e *XMLElement) {
+ u := e.Request.AbsoluteURL(e.Attr("href"))
+ expected := ts.URL + "/foobar/z"
+ if u != expected {
+ t.Errorf("Invalid tag handling in OnXML: expected %q, got %q", expected, u)
+ }
+ })
+ c2.Visit(ts.URL + "/base_relative")
+}
+
+func TestTabsAndNewlines(t *testing.T) {
+ // this test might look odd, but see step 3 of
+ // https://url.spec.whatwg.org/#concept-basic-url-parser
+
+ ts := newTestServer()
+ defer ts.Close()
+
+ visited := map[string]struct{}{}
+ expected := map[string]struct{}{
+ "/tabs_and_newlines": {},
+ "/foobar/xy": {},
+ }
+
+ c := NewCollector()
+ c.OnResponse(func(res *Response) {
+ visited[res.Request.URL.EscapedPath()] = struct{}{}
+ })
+ c.OnHTML("a[href]", func(e *HTMLElement) {
+ if err := e.Request.Visit(e.Attr("href")); err != nil {
+ t.Errorf("visit failed: %v", err)
+ }
+ })
+
+ if err := c.Visit(ts.URL + "/tabs_and_newlines"); err != nil {
+ t.Errorf("visit failed: %v", err)
+ }
+
+ if !reflect.DeepEqual(visited, expected) {
+ t.Errorf("visited=%v expected=%v", visited, expected)
+ }
+}
+
+func TestLonePercent(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ var visitedPath string
+
+ c := NewCollector()
+ c.OnResponse(func(res *Response) {
+ visitedPath = res.Request.URL.RequestURI()
+ })
+ if err := c.Visit(ts.URL + "/100%"); err != nil {
+ t.Errorf("visit failed: %v", err)
+ }
+ // Automatic encoding is not really correct: browsers
+ // would send bare percent here. However, Go net/http
+ // cannot send such requests due to
+ // https://github.com/golang/go/issues/29808. So we have two
+ // alternatives really: return an error when attempting
+ // to fetch such URLs, or at least try the encoded variant.
+ // This test checks that the latter is attempted.
+ if got, want := visitedPath, "/100%25"; got != want {
+ t.Errorf("got=%q want=%q", got, want)
+ }
+ // invalid URL escape in query component is not a problem,
+ // but check it anyway
+ if err := c.Visit(ts.URL + "/?a=100%zz"); err != nil {
+ t.Errorf("visit failed: %v", err)
+ }
+ if got, want := visitedPath, "/?a=100%zz"; got != want {
+ t.Errorf("got=%q want=%q", got, want)
+ }
+}
+
+func TestCollectorCookies(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+
+ if err := c.Visit(ts.URL + "/set_cookie"); err != nil {
+ t.Fatal(err)
+ }
+
+ if err := c.Visit(ts.URL + "/check_cookie"); err != nil {
+ t.Fatalf("Failed to use previously set cookies: %s", err)
+ }
+}
+
+func TestRobotsWhenAllowed(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.IgnoreRobotsTxt = false
+
+ c.OnResponse(func(resp *Response) {
+ if resp.StatusCode != 200 {
+ t.Fatalf("Wrong response code: %d", resp.StatusCode)
+ }
+ })
+
+ err := c.Visit(ts.URL + "/allowed")
+
+ if err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestRobotsWhenDisallowed(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.IgnoreRobotsTxt = false
+
+ c.OnResponse(func(resp *Response) {
+ t.Fatalf("Received response: %d", resp.StatusCode)
+ })
+
+ err := c.Visit(ts.URL + "/disallowed")
+ if err.Error() != "URL blocked by robots.txt" {
+ t.Fatalf("wrong error message: %v", err)
+ }
+}
+
+func TestRobotsWhenDisallowedWithQueryParameter(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.IgnoreRobotsTxt = false
+
+ c.OnResponse(func(resp *Response) {
+ t.Fatalf("Received response: %d", resp.StatusCode)
+ })
+
+ err := c.Visit(ts.URL + "/allowed?q=1")
+ if err.Error() != "URL blocked by robots.txt" {
+ t.Fatalf("wrong error message: %v", err)
+ }
+}
+
+func TestIgnoreRobotsWhenDisallowed(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+ c.IgnoreRobotsTxt = true
+
+ c.OnResponse(func(resp *Response) {
+ if resp.StatusCode != 200 {
+ t.Fatalf("Wrong response code: %d", resp.StatusCode)
+ }
+ })
+
+ err := c.Visit(ts.URL + "/disallowed")
+
+ if err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) {
+ ts := newTestServer()
+ ts.Close() // immediately close the server to force a connection error
+
+ c := NewCollector()
+ c.IgnoreRobotsTxt = false
+ err := c.Visit(ts.URL)
+
+ if err == nil {
+ t.Fatal("Error expected")
+ }
+}
+
+func TestEnvSettings(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ os.Setenv("COLLY_USER_AGENT", "test")
+ defer os.Unsetenv("COLLY_USER_AGENT")
+
+ c := NewCollector()
valid := false
@@ -592,6 +1258,121 @@ func TestEnvSettings(t *testing.T) {
}
}
+func TestUserAgent(t *testing.T) {
+ const exampleUserAgent1 = "Example/1.0"
+ const exampleUserAgent2 = "Example/2.0"
+ const defaultUserAgent = "colly - https://github.com/gocolly/colly/v2"
+
+ ts := newTestServer()
+ defer ts.Close()
+
+ var receivedUserAgent string
+
+ func() {
+ c := NewCollector()
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/user_agent")
+ if got, want := receivedUserAgent, defaultUserAgent; got != want {
+ t.Errorf("mismatched User-Agent: got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(UserAgent(exampleUserAgent1))
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/user_agent")
+ if got, want := receivedUserAgent, exampleUserAgent1; got != want {
+ t.Errorf("mismatched User-Agent: got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(UserAgent(exampleUserAgent1))
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+
+ c.Request("GET", ts.URL+"/user_agent", nil, nil, nil)
+ if got, want := receivedUserAgent, exampleUserAgent1; got != want {
+ t.Errorf("mismatched User-Agent (nil hdr): got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(UserAgent(exampleUserAgent1))
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+
+ c.Request("GET", ts.URL+"/user_agent", nil, nil, http.Header{})
+ if got, want := receivedUserAgent, exampleUserAgent1; got != want {
+ t.Errorf("mismatched User-Agent (non-nil hdr): got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(UserAgent(exampleUserAgent1))
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+ hdr := http.Header{}
+ hdr.Set("User-Agent", "")
+
+ c.Request("GET", ts.URL+"/user_agent", nil, nil, hdr)
+ if got, want := receivedUserAgent, ""; got != want {
+ t.Errorf("mismatched User-Agent (hdr with empty UA): got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(UserAgent(exampleUserAgent1))
+ c.OnResponse(func(resp *Response) {
+ receivedUserAgent = string(resp.Body)
+ })
+ hdr := http.Header{}
+ hdr.Set("User-Agent", exampleUserAgent2)
+
+ c.Request("GET", ts.URL+"/user_agent", nil, nil, hdr)
+ if got, want := receivedUserAgent, exampleUserAgent2; got != want {
+ t.Errorf("mismatched User-Agent (hdr with UA): got=%q want=%q", got, want)
+ }
+ }()
+}
+
+func TestHeaders(t *testing.T) {
+ const exampleHostHeader = "example.com"
+ const exampleTestHeader = "Testing"
+
+ ts := newTestServer()
+ defer ts.Close()
+
+ var receivedHeader string
+
+ func() {
+ c := NewCollector(
+ Headers(map[string]string{"Host": exampleHostHeader}),
+ )
+ c.OnResponse(func(resp *Response) {
+ receivedHeader = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/host_header")
+ if got, want := receivedHeader, exampleHostHeader; got != want {
+ t.Errorf("mismatched Host header: got=%q want=%q", got, want)
+ }
+ }()
+ func() {
+ c := NewCollector(
+ Headers(map[string]string{"Test": exampleTestHeader}),
+ )
+ c.OnResponse(func(resp *Response) {
+ receivedHeader = string(resp.Body)
+ })
+ c.Visit(ts.URL + "/custom_header")
+ if got, want := receivedHeader, exampleTestHeader; got != want {
+ t.Errorf("mismatched custom header: got=%q want=%q", got, want)
+ }
+ }()
+}
+
func TestParseHTTPErrorResponse(t *testing.T) {
contentCount := 0
ts := newTestServer()
@@ -662,7 +1443,7 @@ func TestHTMLElement(t *testing.T) {
}
}
-func TestCollectorOnXML(t *testing.T) {
+func TestCollectorOnXMLWithHtml(t *testing.T) {
ts := newTestServer()
defer ts.Close()
@@ -706,6 +1487,186 @@ func TestCollectorOnXML(t *testing.T) {
}
}
+func TestCollectorOnXMLWithXML(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector()
+
+ titleCallbackCalled := false
+ paragraphCallbackCount := 0
+
+ c.OnXML("//page/title", func(e *XMLElement) {
+ titleCallbackCalled = true
+ if e.Text != "Test Page" {
+ t.Error("Title element text does not match, got", e.Text)
+ }
+ })
+
+ c.OnXML("//page/paragraph", func(e *XMLElement) {
+ paragraphCallbackCount++
+ if e.Attr("type") != "description" {
+ t.Error("Failed to get paragraph's type attribute")
+ }
+ })
+
+ c.OnXML("/page", func(e *XMLElement) {
+ if e.ChildAttr("paragraph", "type") != "description" {
+ t.Error("Invalid type value")
+ }
+ classes := e.ChildAttrs("paragraph", "type")
+ if len(classes) != 2 {
+ t.Error("Invalid type values")
+ }
+ })
+
+ c.Visit(ts.URL + "/xml")
+
+ if !titleCallbackCalled {
+ t.Error("Failed to call OnXML callback for tag")
+ }
+
+ if paragraphCallbackCount != 2 {
+ t.Error("Failed to find all tags")
+ }
+}
+
+func TestCollectorVisitWithTrace(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector(AllowedDomains("localhost", "127.0.0.1", "::1"), TraceHTTP())
+ c.OnResponse(func(resp *Response) {
+ if resp.Trace == nil {
+ t.Error("Failed to initialize trace")
+ }
+ })
+
+ err := c.Visit(ts.URL)
+ if err != nil {
+ t.Errorf("Failed to visit url %s", ts.URL)
+ }
+}
+
+func TestCollectorVisitWithCheckHead(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ c := NewCollector(CheckHead())
+ var requestMethodChain []string
+ c.OnResponse(func(resp *Response) {
+ requestMethodChain = append(requestMethodChain, resp.Request.Method)
+ })
+
+ err := c.Visit(ts.URL)
+ if err != nil {
+ t.Errorf("Failed to visit url %s", ts.URL)
+ }
+ if requestMethodChain[0] != "HEAD" || requestMethodChain[1] != "GET" {
+ t.Errorf("Failed to perform a HEAD request before GET")
+ }
+}
+
+func TestCollectorDepth(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+ maxDepth := 2
+ c1 := NewCollector(
+ MaxDepth(maxDepth),
+ AllowURLRevisit(),
+ )
+ requestCount := 0
+ c1.OnResponse(func(resp *Response) {
+ requestCount++
+ if requestCount >= 10 {
+ return
+ }
+ c1.Visit(ts.URL)
+ })
+ c1.Visit(ts.URL)
+ if requestCount < 10 {
+ t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth", requestCount)
+ }
+
+ c2 := c1.Clone()
+ requestCount = 0
+ c2.OnResponse(func(resp *Response) {
+ requestCount++
+ resp.Request.Visit(ts.URL)
+ })
+ c2.Visit(ts.URL)
+ if requestCount != 2 {
+ t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2", requestCount)
+ }
+
+ c1.Visit(ts.URL)
+ if requestCount < 10 {
+ t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth again", requestCount)
+ }
+
+ requestCount = 0
+ c2.Visit(ts.URL)
+ if requestCount != 2 {
+ t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2 again", requestCount)
+ }
+}
+
+func TestCollectorRequests(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+ maxRequests := uint32(5)
+ c1 := NewCollector(
+ MaxRequests(maxRequests),
+ AllowURLRevisit(),
+ )
+ requestCount := 0
+ c1.OnResponse(func(resp *Response) {
+ requestCount++
+ c1.Visit(ts.URL)
+ })
+ c1.Visit(ts.URL)
+ if requestCount != 5 {
+ t.Errorf("Invalid number of requests: %d (expected 5) with MaxRequests", requestCount)
+ }
+}
+
+func TestCollectorContext(t *testing.T) {
+ // "/slow" takes 1 second to return the response.
+ // If context does abort the transfer after 0.5 seconds as it should,
+ // OnError will be called, and the test is passed. Otherwise, test is failed.
+
+ ts := newTestServer()
+ defer ts.Close()
+
+ ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
+ defer cancel()
+
+ c := NewCollector(StdlibContext(ctx))
+
+ onErrorCalled := false
+
+ c.OnResponse(func(resp *Response) {
+ t.Error("OnResponse was called, expected OnError")
+ })
+
+ c.OnError(func(resp *Response, err error) {
+ onErrorCalled = true
+ if err != context.DeadlineExceeded {
+ t.Errorf("OnError got err=%#v, expected context.DeadlineExceeded", err)
+ }
+ })
+
+ err := c.Visit(ts.URL + "/slow")
+ if err != context.DeadlineExceeded {
+ t.Errorf("Visit return err=%#v, expected context.DeadlineExceeded", err)
+ }
+
+ if !onErrorCalled {
+ t.Error("OnError was not called")
+ }
+}
+
func BenchmarkOnHTML(b *testing.B) {
ts := newTestServer()
defer ts.Close()
@@ -742,3 +1703,114 @@ func BenchmarkOnResponse(b *testing.B) {
c.Visit(ts.URL)
}
}
+
+func requireSessionCookieSimple(handler http.Handler) http.Handler {
+ const cookieName = "session_id"
+
+ return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if _, err := r.Cookie(cookieName); err == http.ErrNoCookie {
+ http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"})
+ http.Redirect(w, r, r.RequestURI, http.StatusFound)
+ return
+ }
+ handler.ServeHTTP(w, r)
+ })
+}
+
+func requireSessionCookieAuthPage(handler http.Handler) http.Handler {
+ const setCookiePath = "/auth"
+ const cookieName = "session_id"
+
+ return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ if r.URL.Path == setCookiePath {
+ destination := r.URL.Query().Get("return")
+ http.Redirect(w, r, destination, http.StatusFound)
+ return
+ }
+ if _, err := r.Cookie(cookieName); err == http.ErrNoCookie {
+ http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"})
+ http.Redirect(w, r, setCookiePath+"?return="+url.QueryEscape(r.RequestURI), http.StatusFound)
+ return
+ }
+ handler.ServeHTTP(w, r)
+ })
+}
+
+func TestCollectorPostRetry(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+
+ postValue := "hello"
+ c := NewCollector()
+ try := false
+ c.OnResponse(func(r *Response) {
+ if r.Ctx.Get("notFirst") == "" {
+ r.Ctx.Put("notFirst", "first")
+ _ = r.Request.Retry()
+ return
+ }
+ if postValue != string(r.Body) {
+ t.Error("Failed to send data with POST")
+ }
+ try = true
+ })
+
+ c.Post(ts.URL+"/login", map[string]string{
+ "name": postValue,
+ })
+ if !try {
+ t.Error("OnResponse Retry was not called")
+ }
+}
+func TestCollectorGetRetry(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+ try := false
+
+ c := NewCollector()
+
+ c.OnResponse(func(r *Response) {
+ if r.Ctx.Get("notFirst") == "" {
+ r.Ctx.Put("notFirst", "first")
+ _ = r.Request.Retry()
+ return
+ }
+ if !bytes.Equal(r.Body, serverIndexResponse) {
+ t.Error("Response body does not match with the original content")
+ }
+ try = true
+ })
+
+ c.Visit(ts.URL)
+ if !try {
+ t.Error("OnResponse Retry was not called")
+ }
+}
+
+func TestCollectorPostRetryUnseekable(t *testing.T) {
+ ts := newTestServer()
+ defer ts.Close()
+ try := false
+ postValue := "hello"
+ c := NewCollector()
+
+ c.OnResponse(func(r *Response) {
+ if postValue != string(r.Body) {
+ t.Error("Failed to send data with POST")
+ }
+
+ if r.Ctx.Get("notFirst") == "" {
+ r.Ctx.Put("notFirst", "first")
+ err := r.Request.Retry()
+ if !errors.Is(err, ErrRetryBodyUnseekable) {
+ t.Errorf("Unexpected error Type ErrRetryBodyUnseekable : %v", err)
+ }
+ return
+ }
+ try = true
+ })
+ c.Request("POST", ts.URL+"/login", bytes.NewBuffer([]byte("name="+postValue)), nil, nil)
+ if try {
+ t.Error("OnResponse Retry was called but BodyUnseekable")
+ }
+}
diff --git a/debug/webdebugger.go b/debug/webdebugger.go
index e246361e1..504a9eb04 100644
--- a/debug/webdebugger.go
+++ b/debug/webdebugger.go
@@ -18,6 +18,7 @@ import (
"encoding/json"
"log"
"net/http"
+ "sync"
"time"
)
@@ -28,6 +29,7 @@ type WebDebugger struct {
initialized bool
CurrentRequests map[uint32]requestInfo
RequestLog []requestInfo
+ sync.Mutex
}
type requestInfo struct {
@@ -61,6 +63,9 @@ func (w *WebDebugger) Init() error {
// Event updates the debugger's status
func (w *WebDebugger) Event(e *Event) {
+ w.Lock()
+ defer w.Unlock()
+
switch e.Type {
case "request":
w.CurrentRequests[e.RequestID] = requestInfo{
@@ -119,11 +124,11 @@ function fetchStatus() {
$("#request_log_count").text('(' + data.RequestLog.length + ')');
for(var i in data.CurrentRequests) {
var r = data.CurrentRequests[i];
- $("#current_requests").append(curRequestTpl(r.Url, r.Started, r.CollectorId));
+ $("#current_requests").append(curRequestTpl(r.URL, r.Started, r.CollectorID));
}
for(var i in data.RequestLog.reverse()) {
var r = data.RequestLog[i];
- $("#request_log").append(requestLogTpl(r.Url, r.Duration, r.CollectorId));
+ $("#request_log").append(requestLogTpl(r.URL, r.Duration, r.CollectorID));
}
setTimeout(fetchStatus, 1000);
});
@@ -138,7 +143,9 @@ $(document).ready(function() {
}
func (w *WebDebugger) statusHandler(wr http.ResponseWriter, r *http.Request) {
+ w.Lock()
jsonData, err := json.MarshalIndent(w, "", " ")
+ w.Unlock()
if err != nil {
panic(err)
}
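Note: the embedded mutex matters because an async collector has several worker goroutines calling Event concurrently while statusHandler serializes the same maps. A sketch of the setup that would trip the race detector without the lock (URL hypothetical):

    package main

    import (
        "github.com/gocolly/colly/v2"
        "github.com/gocolly/colly/v2/debug"
    )

    func main() {
        c := colly.NewCollector(
            colly.Debugger(&debug.WebDebugger{}), // web UI served on 127.0.0.1:7676 by default
            colly.Async(),                        // concurrent requests emit concurrent debugger events
        )
        c.Visit("http://example.com/") // hypothetical URL
        c.Wait()
    }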
diff --git a/extensions/random_user_agent.go b/extensions/random_user_agent.go
index 6426a14ff..296b2f6e6 100644
--- a/extensions/random_user_agent.go
+++ b/extensions/random_user_agent.go
@@ -3,57 +3,537 @@ package extensions
import (
"fmt"
"math/rand"
+ "strings"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
var uaGens = []func() string{
genFirefoxUA,
genChromeUA,
+ genEdgeUA,
+ genOperaUA,
}
-// RandomUserAgent generates a random browser user agent on every request
+var uaGensMobile = []func() string{
+ genMobilePixel7UA,
+ genMobilePixel6UA,
+ genMobilePixel5UA,
+ genMobilePixel4UA,
+ genMobileNexus10UA,
+}
+
+// RandomUserAgent generates a random DESKTOP browser User-Agent on every request
func RandomUserAgent(c *colly.Collector) {
c.OnRequest(func(r *colly.Request) {
r.Headers.Set("User-Agent", uaGens[rand.Intn(len(uaGens))]())
})
}
+// RandomMobileUserAgent generates a random MOBILE browser User-Agent on every request
+func RandomMobileUserAgent(c *colly.Collector) {
+ c.OnRequest(func(r *colly.Request) {
+ r.Headers.Set("User-Agent", uaGensMobile[rand.Intn(len(uaGensMobile))]())
+ })
+}
+
var ffVersions = []float32{
- 58.0,
- 57.0,
- 56.0,
- 52.0,
- 48.0,
- 40.0,
- 35.0,
+ // NOTE: Only versions released after Jun 1, 2022 are listed.
+ // Data source: https://en.wikipedia.org/wiki/Firefox_version_history
+
+ // 2022
+ 102.0,
+ 103.0,
+ 104.0,
+ 105.0,
+ 106.0,
+ 107.0,
+ 108.0,
+
+ // 2023
+ 109.0,
+ 110.0,
+ 111.0,
+ 112.0,
+ 113.0,
}
var chromeVersions = []string{
- "65.0.3325.146",
- "64.0.3282.0",
- "41.0.2228.0",
- "40.0.2214.93",
- "37.0.2062.124",
+ // NOTE: Only versions released after Jun 1, 2022 are listed.
+ // Data source: https://chromereleases.googleblog.com/search/label/Stable%20updates
+
+ // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop.html
+ "102.0.5005.115",
+
+ // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_21.html
+ "103.0.5060.53",
+
+ // https://chromereleases.googleblog.com/2022/06/stable-channel-update-for-desktop_27.html
+ "103.0.5060.66",
+
+ // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop.html
+ "103.0.5060.114",
+
+ // https://chromereleases.googleblog.com/2022/07/stable-channel-update-for-desktop_19.html
+ "103.0.5060.134",
+
+ // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop.html
+ "104.0.5112.79",
+ "104.0.5112.80",
+ "104.0.5112.81",
+
+ // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_16.html
+ "104.0.5112.101",
+ "104.0.5112.102",
+
+ // https://chromereleases.googleblog.com/2022/08/stable-channel-update-for-desktop_30.html
+ "105.0.5195.52",
+ "105.0.5195.53",
+ "105.0.5195.54",
+
+ // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop.html
+ "105.0.5195.102",
+
+ // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_14.html
+ "105.0.5195.125",
+ "105.0.5195.126",
+ "105.0.5195.127",
+
+ // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_27.html
+ "106.0.5249.61",
+ "106.0.5249.62",
+
+ // https://chromereleases.googleblog.com/2022/09/stable-channel-update-for-desktop_30.html
+ "106.0.5249.91",
+
+ // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop.html
+ "106.0.5249.103",
+
+ // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_11.html
+ "106.0.5249.119",
+
+ // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_25.html
+ "107.0.5304.62",
+ "107.0.5304.63",
+ "107.0.5304.68",
+
+ // https://chromereleases.googleblog.com/2022/10/stable-channel-update-for-desktop_27.html
+ "107.0.5304.87",
+ "107.0.5304.88",
+
+ // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop.html
+ "107.0.5304.106",
+ "107.0.5304.107",
+ "107.0.5304.110",
+
+ // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_24.html
+ "107.0.5304.121",
+ "107.0.5304.122",
+
+ // https://chromereleases.googleblog.com/2022/11/stable-channel-update-for-desktop_29.html
+ "108.0.5359.71",
+ "108.0.5359.72",
+
+ // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop.html
+ "108.0.5359.94",
+ "108.0.5359.95",
+
+ // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_7.html
+ "108.0.5359.98",
+ "108.0.5359.99",
+
+ // https://chromereleases.googleblog.com/2022/12/stable-channel-update-for-desktop_13.html
+ "108.0.5359.124",
+ "108.0.5359.125",
+
+ // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop.html
+ "109.0.5414.74",
+ "109.0.5414.75",
+ "109.0.5414.87",
+
+ // https://chromereleases.googleblog.com/2023/01/stable-channel-update-for-desktop_24.html
+ "109.0.5414.119",
+ "109.0.5414.120",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-update-for-desktop.html
+ "110.0.5481.77",
+ "110.0.5481.78",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update.html
+ "110.0.5481.96",
+ "110.0.5481.97",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_14.html
+ "110.0.5481.100",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_16.html
+ "110.0.5481.104",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_22.html
+ "110.0.5481.177",
+ "110.0.5481.178",
+
+ // https://chromereleases.googleblog.com/2023/02/stable-channel-desktop-update_97.html
+ "109.0.5414.129",
+
+ // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop.html
+ "111.0.5563.64",
+ "111.0.5563.65",
+
+ // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_21.html
+ "111.0.5563.110",
+ "111.0.5563.111",
+
+ // https://chromereleases.googleblog.com/2023/03/stable-channel-update-for-desktop_27.html
+ "111.0.5563.146",
+ "111.0.5563.147",
+
+ // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop.html
+ "112.0.5615.49",
+ "112.0.5615.50",
+
+ // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_12.html
+ "112.0.5615.86",
+ "112.0.5615.87",
+
+ // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_14.html
+ "112.0.5615.121",
+
+ // https://chromereleases.googleblog.com/2023/04/stable-channel-update-for-desktop_18.html
+ "112.0.5615.137",
+ "112.0.5615.138",
+ "112.0.5615.165",
+
+ // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop.html
+ "113.0.5672.63",
+ "113.0.5672.64",
+
+ // https://chromereleases.googleblog.com/2023/05/stable-channel-update-for-desktop_8.html
+ "113.0.5672.92",
+ "113.0.5672.93",
+}
+
+var edgeVersions = []string{
+ // NOTE: Only versions released after Jun 1, 2022 are listed.
+ // Data source: https://learn.microsoft.com/en-us/deployedge/microsoft-edge-release-schedule
+
+ // 2022
+ "103.0.0.0,103.0.1264.37",
+ "104.0.0.0,104.0.1293.47",
+ "105.0.0.0,105.0.1343.25",
+ "106.0.0.0,106.0.1370.34",
+ "107.0.0.0,107.0.1418.24",
+ "108.0.0.0,108.0.1462.42",
+
+ // 2023
+ "109.0.0.0,109.0.1518.49",
+ "110.0.0.0,110.0.1587.41",
+ "111.0.0.0,111.0.1661.41",
+ "112.0.0.0,112.0.1722.34",
+ "113.0.0.0,113.0.1774.3",
+}
+
+var operaVersions = []string{
+ // NOTE: Only versions released after Jan 1, 2023 are listed.
+ // Data source: https://blogs.opera.com/desktop/
+
+ // https://blogs.opera.com/desktop/changelog-for-96/
+ "110.0.5449.0,96.0.4640.0",
+ "110.0.5464.2,96.0.4653.0",
+ "110.0.5464.2,96.0.4660.0",
+ "110.0.5481.30,96.0.4674.0",
+ "110.0.5481.30,96.0.4691.0",
+ "110.0.5481.30,96.0.4693.12",
+ "110.0.5481.77,96.0.4693.16",
+ "110.0.5481.100,96.0.4693.20",
+ "110.0.5481.178,96.0.4693.31",
+ "110.0.5481.178,96.0.4693.50",
+ "110.0.5481.192,96.0.4693.80",
+
+ // https://blogs.opera.com/desktop/changelog-for-97/
+ "111.0.5532.2,97.0.4711.0",
+ "111.0.5532.2,97.0.4704.0",
+ "111.0.5532.2,97.0.4697.0",
+ "111.0.5562.0,97.0.4718.0",
+ "111.0.5563.19,97.0.4719.4",
+ "111.0.5563.19,97.0.4719.11",
+ "111.0.5563.41,97.0.4719.17",
+ "111.0.5563.65,97.0.4719.26",
+ "111.0.5563.65,97.0.4719.28",
+ "111.0.5563.111,97.0.4719.43",
+ "111.0.5563.147,97.0.4719.63",
+ "111.0.5563.147,97.0.4719.83",
+
+ // https://blogs.opera.com/desktop/changelog-for-98/
+ "112.0.5596.2,98.0.4756.0",
+ "112.0.5596.2,98.0.4746.0",
+ "112.0.5615.20,98.0.4759.1",
+ "112.0.5615.50,98.0.4759.3",
+ "112.0.5615.87,98.0.4759.6",
+ "112.0.5615.165,98.0.4759.15",
+ "112.0.5615.165,98.0.4759.21",
+ "112.0.5615.165,98.0.4759.39",
+}
+
+var pixel7AndroidVersions = []string{
+ // Data source:
+ // - https://developer.android.com/about/versions
+ // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+ "13",
+}
+
+var pixel6AndroidVersions = []string{
+ // Data source:
+ // - https://developer.android.com/about/versions
+ // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+ "12",
+ "13",
+}
+
+var pixel5AndroidVersions = []string{
+ // Data source:
+ // - https://developer.android.com/about/versions
+ // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+ "11",
+ "12",
+ "13",
+}
+
+var pixel4AndroidVersions = []string{
+ // Data source:
+ // - https://developer.android.com/about/versions
+ // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+ "10",
+ "11",
+ "12",
+ "13",
+}
+
+var nexus10AndroidVersions = []string{
+ // Data source:
+ // - https://developer.android.com/about/versions
+ // - https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+ "4.4.2",
+ "4.4.4",
+ "5.0",
+ "5.0.1",
+ "5.0.2",
+ "5.1",
+ "5.1.1",
+}
+
+var nexus10Builds = []string{
+ // Data source: https://source.android.com/docs/setup/about/build-numbers#source-code-tags-and-builds
+
+ "LMY49M", // android-5.1.1_r38 (Lollipop)
+ "LMY49J", // android-5.1.1_r37 (Lollipop)
+ "LMY49I", // android-5.1.1_r36 (Lollipop)
+ "LMY49H", // android-5.1.1_r35 (Lollipop)
+ "LMY49G", // android-5.1.1_r34 (Lollipop)
+ "LMY49F", // android-5.1.1_r33 (Lollipop)
+ "LMY48Z", // android-5.1.1_r30 (Lollipop)
+ "LMY48X", // android-5.1.1_r25 (Lollipop)
+ "LMY48T", // android-5.1.1_r19 (Lollipop)
+ "LMY48M", // android-5.1.1_r14 (Lollipop)
+ "LMY48I", // android-5.1.1_r9 (Lollipop)
+ "LMY47V", // android-5.1.1_r1 (Lollipop)
+ "LMY47D", // android-5.1.0_r1 (Lollipop)
+ "LRX22G", // android-5.0.2_r1 (Lollipop)
+ "LRX22C", // android-5.0.1_r1 (Lollipop)
+ "LRX21P", // android-5.0.0_r4.0.1 (Lollipop)
+ "KTU84P", // android-4.4.4_r1 (KitKat)
+ "KTU84L", // android-4.4.3_r1 (KitKat)
+ "KOT49H", // android-4.4.2_r1 (KitKat)
+ "KOT49E", // android-4.4.1_r1 (KitKat)
+ "KRT16S", // android-4.4_r1.2 (KitKat)
+ "JWR66Y", // android-4.3_r1.1 (Jelly Bean)
+ "JWR66V", // android-4.3_r1 (Jelly Bean)
+ "JWR66N", // android-4.3_r0.9.1 (Jelly Bean)
+ "JDQ39 ", // android-4.2.2_r1 (Jelly Bean)
+ "JOP40F", // android-4.2.1_r1.1 (Jelly Bean)
+ "JOP40D", // android-4.2.1_r1 (Jelly Bean)
+ "JOP40C", // android-4.2_r1 (Jelly Bean)
}
var osStrings = []string{
- "Macintosh; Intel Mac OS X 10_10",
- "Windows NT 10.0",
+ // MacOS - High Sierra
+ "Macintosh; Intel Mac OS X 10_13",
+ "Macintosh; Intel Mac OS X 10_13_1",
+ "Macintosh; Intel Mac OS X 10_13_2",
+ "Macintosh; Intel Mac OS X 10_13_3",
+ "Macintosh; Intel Mac OS X 10_13_4",
+ "Macintosh; Intel Mac OS X 10_13_5",
+ "Macintosh; Intel Mac OS X 10_13_6",
+
+ // MacOS - Mojave
+ "Macintosh; Intel Mac OS X 10_14",
+ "Macintosh; Intel Mac OS X 10_14_1",
+ "Macintosh; Intel Mac OS X 10_14_2",
+ "Macintosh; Intel Mac OS X 10_14_3",
+ "Macintosh; Intel Mac OS X 10_14_4",
+ "Macintosh; Intel Mac OS X 10_14_5",
+ "Macintosh; Intel Mac OS X 10_14_6",
+
+ // MacOS - Catalina
+ "Macintosh; Intel Mac OS X 10_15",
+ "Macintosh; Intel Mac OS X 10_15_1",
+ "Macintosh; Intel Mac OS X 10_15_2",
+ "Macintosh; Intel Mac OS X 10_15_3",
+ "Macintosh; Intel Mac OS X 10_15_4",
+ "Macintosh; Intel Mac OS X 10_15_5",
+ "Macintosh; Intel Mac OS X 10_15_6",
+ "Macintosh; Intel Mac OS X 10_15_7",
+
+ // MacOS - Big Sur
+ "Macintosh; Intel Mac OS X 11_0",
+ "Macintosh; Intel Mac OS X 11_0_1",
+ "Macintosh; Intel Mac OS X 11_1",
+ "Macintosh; Intel Mac OS X 11_2",
+ "Macintosh; Intel Mac OS X 11_2_1",
+ "Macintosh; Intel Mac OS X 11_2_2",
+ "Macintosh; Intel Mac OS X 11_2_3",
+ "Macintosh; Intel Mac OS X 11_3",
+ "Macintosh; Intel Mac OS X 11_3_1",
+ "Macintosh; Intel Mac OS X 11_4",
+ "Macintosh; Intel Mac OS X 11_5",
+ "Macintosh; Intel Mac OS X 11_5_1",
+ "Macintosh; Intel Mac OS X 11_5_2",
+ "Macintosh; Intel Mac OS X 11_6",
+ "Macintosh; Intel Mac OS X 11_6_1",
+ "Macintosh; Intel Mac OS X 11_6_2",
+ "Macintosh; Intel Mac OS X 11_6_3",
+ "Macintosh; Intel Mac OS X 11_6_4",
+ "Macintosh; Intel Mac OS X 11_6_5",
+ "Macintosh; Intel Mac OS X 11_6_6",
+ "Macintosh; Intel Mac OS X 11_6_7",
+ "Macintosh; Intel Mac OS X 11_6_8",
+ "Macintosh; Intel Mac OS X 11_7",
+ "Macintosh; Intel Mac OS X 11_7_1",
+ "Macintosh; Intel Mac OS X 11_7_2",
+ "Macintosh; Intel Mac OS X 11_7_3",
+ "Macintosh; Intel Mac OS X 11_7_4",
+ "Macintosh; Intel Mac OS X 11_7_5",
+ "Macintosh; Intel Mac OS X 11_7_6",
+
+ // MacOS - Monterey
+ "Macintosh; Intel Mac OS X 12_0",
+ "Macintosh; Intel Mac OS X 12_0_1",
+ "Macintosh; Intel Mac OS X 12_1",
+ "Macintosh; Intel Mac OS X 12_2",
+ "Macintosh; Intel Mac OS X 12_2_1",
+ "Macintosh; Intel Mac OS X 12_3",
+ "Macintosh; Intel Mac OS X 12_3_1",
+ "Macintosh; Intel Mac OS X 12_4",
+ "Macintosh; Intel Mac OS X 12_5",
+ "Macintosh; Intel Mac OS X 12_5_1",
+ "Macintosh; Intel Mac OS X 12_6",
+ "Macintosh; Intel Mac OS X 12_6_1",
+ "Macintosh; Intel Mac OS X 12_6_2",
+ "Macintosh; Intel Mac OS X 12_6_3",
+ "Macintosh; Intel Mac OS X 12_6_4",
+ "Macintosh; Intel Mac OS X 12_6_5",
+
+ // MacOS - Ventura
+ "Macintosh; Intel Mac OS X 13_0",
+ "Macintosh; Intel Mac OS X 13_0_1",
+ "Macintosh; Intel Mac OS X 13_1",
+ "Macintosh; Intel Mac OS X 13_2",
+ "Macintosh; Intel Mac OS X 13_2_1",
+ "Macintosh; Intel Mac OS X 13_3",
+ "Macintosh; Intel Mac OS X 13_3_1",
+
+ // Windows
+ "Windows NT 10.0; Win64; x64",
"Windows NT 5.1",
"Windows NT 6.1; WOW64",
"Windows NT 6.1; Win64; x64",
+
+ // Linux
"X11; Linux x86_64",
}
+// Generates Firefox Browser User-Agent (Desktop)
+//
+// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0"
func genFirefoxUA() string {
version := ffVersions[rand.Intn(len(ffVersions))]
os := osStrings[rand.Intn(len(osStrings))]
return fmt.Sprintf("Mozilla/5.0 (%s; rv:%.1f) Gecko/20100101 Firefox/%.1f", os, version, version)
}
+// Generates Chrome Browser User-Agent (Desktop)
+//
+// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"
func genChromeUA() string {
version := chromeVersions[rand.Intn(len(chromeVersions))]
os := osStrings[rand.Intn(len(osStrings))]
return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", os, version)
}
+
+// Generates Microsoft Edge User-Agent (Desktop)
+//
+// -> "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39"
+func genEdgeUA() string {
+ version := edgeVersions[rand.Intn(len(edgeVersions))]
+ chromeVersion := strings.Split(version, ",")[0]
+ edgeVersion := strings.Split(version, ",")[1]
+ os := osStrings[rand.Intn(len(osStrings))]
+ return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 Edg/%s", os, chromeVersion, edgeVersion)
+}
+
+// Generates Opera Browser User-Agent (Desktop)
+//
+// -> "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 OPR/98.0.4759.3"
+func genOperaUA() string {
+ version := operaVersions[rand.Intn(len(operaVersions))]
+ chromeVersion := strings.Split(version, ",")[0]
+ operaVersion := strings.Split(version, ",")[1]
+ os := osStrings[rand.Intn(len(osStrings))]
+ return fmt.Sprintf("Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36 OPR/%s", os, chromeVersion, operaVersion)
+}
+
+// Generates Pixel 7 Browser User-Agent (Mobile)
+//
+// -> Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36
+func genMobilePixel7UA() string {
+ android := pixel7AndroidVersions[rand.Intn(len(pixel7AndroidVersions))]
+ chrome := chromeVersions[rand.Intn(len(chromeVersions))]
+ return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, chrome)
+}
+
+// genMobilePixel6UA generates a Pixel 6 browser User-Agent (Mobile), e.g.:
+//
+// -> "Mozilla/5.0 (Linux; Android 13; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
+func genMobilePixel6UA() string {
+ android := pixel6AndroidVersions[rand.Intn(len(pixel6AndroidVersions))]
+ chrome := chromeVersions[rand.Intn(len(chromeVersions))]
+ return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Mobile Safari/537.36", android, chrome)
+}
+
+// genMobilePixel5UA generates a Pixel 5 browser User-Agent (Mobile), e.g.:
+//
+// -> "Mozilla/5.0 (Linux; Android 13; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
+func genMobilePixel5UA() string {
+ android := pixel5AndroidVersions[rand.Intn(len(pixel5AndroidVersions))]
+ chrome := chromeVersions[rand.Intn(len(chromeVersions))]
+ return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Mobile Safari/537.36", android, chrome)
+}
+
+// genMobilePixel4UA generates a Pixel 4 browser User-Agent (Mobile), e.g.:
+//
+// -> "Mozilla/5.0 (Linux; Android 13; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Mobile Safari/537.36"
+func genMobilePixel4UA() string {
+ android := pixel4AndroidVersions[rand.Intn(len(pixel4AndroidVersions))]
+ chrome := chromeVersions[rand.Intn(len(chromeVersions))]
+ return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Mobile Safari/537.36", android, chrome)
+}
+
+// genMobileNexus10UA generates a Nexus 10 browser User-Agent (Mobile); as a
+// tablet UA it omits the "Mobile" token, e.g.:
+//
+// -> "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 10 Build/LMY48T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Safari/537.36"
+func genMobileNexus10UA() string {
+ build := nexus10Builds[rand.Intn(len(nexus10Builds))]
+ android := nexus10AndroidVersions[rand.Intn(len(nexus10AndroidVersions))]
+ chrome := chromeVersions[rand.Intn(len(chromeVersions))]
+ return fmt.Sprintf("Mozilla/5.0 (Linux; Android %s; Nexus 10 Build/%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36", android, build, chrome)
+}
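
These generators are unexported; downstream code typically consumes them through this package's RandomUserAgent helper, which stamps a randomly generated User-Agent onto every request. A minimal usage sketch (the target URL is illustrative):

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/extensions"
)

func main() {
	c := colly.NewCollector()
	// Pick a random generated User-Agent for each outgoing request.
	extensions.RandomUserAgent(c)
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("UA:", r.Headers.Get("User-Agent"))
	})
	c.Visit("https://httpbin.org/headers")
}
```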
diff --git a/extensions/referer.go b/extensions/referer.go
index 6b13a32a3..32a1c69ea 100644
--- a/extensions/referer.go
+++ b/extensions/referer.go
@@ -1,7 +1,7 @@
package extensions
import (
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// Referer sets valid Referer HTTP header to requests.
diff --git a/extensions/url_length_filter.go b/extensions/url_length_filter.go
index 695b74e57..141cfb57d 100644
--- a/extensions/url_length_filter.go
+++ b/extensions/url_length_filter.go
@@ -1,7 +1,7 @@
package extensions
import (
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
// URLLengthFilter filters out requests with URLs longer than URLLengthLimit
diff --git a/go.mod b/go.mod
new file mode 100644
index 000000000..8cdce202e
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,19 @@
+module github.com/gocolly/colly/v2
+
+go 1.12
+
+require (
+ github.com/PuerkitoBio/goquery v1.5.1
+ github.com/andybalholm/cascadia v1.3.1 // indirect
+ github.com/antchfx/htmlquery v1.2.3
+ github.com/antchfx/xmlquery v1.3.4
+ github.com/gobwas/glob v0.2.3
+ github.com/jawher/mow.cli v1.1.0
+ github.com/kennygrant/sanitize v1.2.4
+ github.com/nlnwa/whatwg-url v0.1.2
+ github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca
+ github.com/temoto/robotstxt v1.1.1
+ golang.org/x/net v0.17.0
+ google.golang.org/appengine v1.6.6
+ google.golang.org/protobuf v1.33.0 // indirect
+)
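
Since the module path now carries the /v2 suffix, downstream imports must switch to the versioned path, as the referer.go and url_length_filter.go hunks above already do. A sketch of a migrated import block:

```go
import (
	// old v1 path, no longer valid for this module:
	// "github.com/gocolly/colly"

	// v2 module path declared in go.mod above:
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/extensions"
	"github.com/gocolly/colly/v2/queue"
)
```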
diff --git a/go.sum b/go.sum
new file mode 100644
index 000000000..861506403
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,104 @@
+github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
+github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
+github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
+github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
+github.com/antchfx/xmlquery v1.3.4 h1:RuhsI4AA5Ma4XoXhaAr2VjJxU0Xp0W2zy/f9ZIpsF4s=
+github.com/antchfx/xmlquery v1.3.4/go.mod h1:64w0Xesg2sTaawIdNqMB+7qaW/bSqkQm+ssPaCMWNnc=
+github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
+github.com/antchfx/xpath v1.1.10 h1:cJ0pOvEdN/WvYXxvRrzQH9x5QWKpzHacYO8qzCcDYAg=
+github.com/antchfx/xpath v1.1.10/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
+github.com/bits-and-blooms/bitset v1.2.2-0.20220111210104-dfa3e347c392 h1:9d7ak0NpT8/bhFM5ZkQuLpeS8Ey9zDY9OJJcOYqYV4c=
+github.com/bits-and-blooms/bitset v1.2.2-0.20220111210104-dfa3e347c392/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4=
+github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
+github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
+github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/jawher/mow.cli v1.1.0 h1:NdtHXRc0CwZQ507wMvQ/IS+Q3W3x2fycn973/b8Zuk8=
+github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
+github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
+github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/nlnwa/whatwg-url v0.1.2 h1:BqqsIVG6xv71wOoMAoFDmV6OK6/2sXn7BJdOsTkBl88=
+github.com/nlnwa/whatwg-url v0.1.2/go.mod h1:b0r+dEyM/KztLMDSVY6ApcO9Fmzgq+e9+Ugq20UBYck=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
+github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
+github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220114011407-0dd24b26b47d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
+google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
+google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
+google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
+google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
diff --git a/htmlelement.go b/htmlelement.go
index 92484bd2b..7128949e5 100644
--- a/htmlelement.go
+++ b/htmlelement.go
@@ -68,6 +68,17 @@ func (h *HTMLElement) ChildText(goquerySelector string) string {
return strings.TrimSpace(h.DOM.Find(goquerySelector).Text())
}
+// ChildTexts returns the stripped text content of all the matching
+// elements.
+func (h *HTMLElement) ChildTexts(goquerySelector string) []string {
+ var res []string
+ h.DOM.Find(goquerySelector).Each(func(_ int, s *goquery.Selection) {
+ res = append(res, strings.TrimSpace(s.Text()))
+ })
+ return res
+}
+
// ChildAttr returns the stripped text content of the first matching
// element's attribute.
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string {
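
A minimal sketch of the new ChildTexts helper in use (the selectors and URL are illustrative):

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnHTML("ul.products", func(e *colly.HTMLElement) {
		// ChildTexts returns the trimmed text of each matching child;
		// ChildText would concatenate them all into a single string.
		fmt.Println(e.ChildTexts("li > a"))
	})
	c.Visit("https://example.com/")
}
```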
diff --git a/http_backend.go b/http_backend.go
index 5c3c216d2..e580f7a2e 100644
--- a/http_backend.go
+++ b/http_backend.go
@@ -19,12 +19,12 @@ import (
"encoding/gob"
"encoding/hex"
"io"
- "io/ioutil"
"math/rand"
"net/http"
"os"
"path"
"regexp"
+ "strings"
"sync"
"time"
@@ -39,16 +39,18 @@ type httpBackend struct {
lock *sync.RWMutex
}
+type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool
+
// LimitRule provides connection restrictions for domains.
// Both DomainRegexp and DomainGlob can be used to specify
// the included domains patterns, but at least one is required.
// There can be two kind of limitations:
-// - Parallelism: Set limit for the number of concurrent requests to matching domains
-// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
+// - Parallelism: Set limit for the number of concurrent requests to matching domains
+// - Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
type LimitRule struct {
// DomainRegexp is a regular expression to match against domains
DomainRegexp string
- // DomainRegexp is a glob pattern to match against domains
+ // DomainGlob is a glob pattern to match against domains
DomainGlob string
// Delay is the duration to wait before creating a new request to the matching domains
Delay time.Duration
@@ -126,9 +128,9 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule {
return nil
}
-func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string) (*Response, error) {
- if cacheDir == "" || request.Method != "GET" {
- return h.Do(request, bodySize)
+func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) {
+ if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" {
+ return h.Do(request, bodySize, checkHeadersFunc)
}
sum := sha1.Sum([]byte(request.URL.String()))
hash := hex.EncodeToString(sum[:])
@@ -138,11 +140,12 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string
resp := new(Response)
err := gob.NewDecoder(file).Decode(resp)
file.Close()
+ checkHeadersFunc(request, resp.StatusCode, *resp.Headers)
if resp.StatusCode < 500 {
return resp, err
}
}
- resp, err := h.Do(request, bodySize)
+ resp, err := h.Do(request, bodySize, checkHeadersFunc)
if err != nil || resp.StatusCode >= 500 {
return resp, err
}
@@ -163,7 +166,7 @@ func (h *httpBackend) Cache(request *http.Request, bodySize int, cacheDir string
return resp, os.Rename(filename+"~", filename)
}
-func (h *httpBackend) Do(request *http.Request, bodySize int) (*Response, error) {
+func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) {
r := h.GetMatchingRule(request.URL.Host)
if r != nil {
r.waitChan <- true
@@ -181,22 +184,31 @@ func (h *httpBackend) Do(request *http.Request, bodySize int) (*Response, error)
if err != nil {
return nil, err
}
+ defer res.Body.Close()
+
+ finalRequest := request
if res.Request != nil {
- *request = *res.Request
+ finalRequest = res.Request
+ }
+ if !checkHeadersFunc(finalRequest, res.StatusCode, res.Header) {
+ // closing res.Body (see defer above) without reading it aborts
+ // the download
+ return nil, ErrAbortedAfterHeaders
}
var bodyReader io.Reader = res.Body
if bodySize > 0 {
bodyReader = io.LimitReader(bodyReader, int64(bodySize))
}
- if !res.Uncompressed && res.Header.Get("Content-Encoding") == "gzip" {
+ contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
+ if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
bodyReader, err = gzip.NewReader(bodyReader)
if err != nil {
return nil, err
}
+ defer bodyReader.(*gzip.Reader).Close()
}
- body, err := ioutil.ReadAll(bodyReader)
- defer res.Body.Close()
+ body, err := io.ReadAll(bodyReader)
if err != nil {
return nil, err
}
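
The new checkHeadersFunc parameter is the backend half of header-time filtering: when it returns false after the status and headers arrive, the body is closed unread and the request fails with ErrAbortedAfterHeaders. On the collector side, v2 surfaces this through the OnResponseHeaders callback together with Request.Abort(); a hedged sketch, with an illustrative content-type check:

```go
package main

import (
	"strings"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnResponseHeaders(func(r *colly.Response) {
		// Abort before the body is downloaded, e.g. to skip binary payloads;
		// the backend then reports ErrAbortedAfterHeaders for this request.
		if strings.Contains(r.Headers.Get("Content-Type"), "octet-stream") {
			r.Request.Abort()
		}
	})
	c.Visit("https://example.com/data.bin")
}
```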
diff --git a/http_trace.go b/http_trace.go
new file mode 100644
index 000000000..bcacbe313
--- /dev/null
+++ b/http_trace.go
@@ -0,0 +1,37 @@
+package colly
+
+import (
+ "net/http"
+ "net/http/httptrace"
+ "time"
+)
+
+// HTTPTrace provides a data structure for storing an HTTP trace.
+type HTTPTrace struct {
+ start, connect time.Time
+ ConnectDuration time.Duration
+ FirstByteDuration time.Duration
+}
+
+// trace returns an httptrace.ClientTrace object to be used with an HTTP
+// request via httptrace.WithClientTrace() that fills in this HTTPTrace.
+func (ht *HTTPTrace) trace() *httptrace.ClientTrace {
+ trace := &httptrace.ClientTrace{
+ ConnectStart: func(network, addr string) { ht.connect = time.Now() },
+ ConnectDone: func(network, addr string, err error) {
+ ht.ConnectDuration = time.Since(ht.connect)
+ },
+
+ GetConn: func(hostPort string) { ht.start = time.Now() },
+ GotFirstResponseByte: func() {
+ ht.FirstByteDuration = time.Since(ht.start)
+ },
+ }
+ return trace
+}
+
+// WithTrace returns the given HTTP Request with this HTTPTrace added to its
+// context.
+func (ht *HTTPTrace) WithTrace(req *http.Request) *http.Request {
+ return req.WithContext(httptrace.WithClientTrace(req.Context(), ht.trace()))
+}
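
As the response.go hunk further below notes, Response.Trace is only populated when the collector is built with the TraceHTTP option; a minimal sketch of reading the recorded durations:

```go
package main

import (
	"fmt"

	"github.com/gocolly/colly/v2"
)

func main() {
	// TraceHTTP makes the collector attach an HTTPTrace to each request
	// and expose it on the response.
	c := colly.NewCollector(colly.TraceHTTP())
	c.OnResponse(func(r *colly.Response) {
		fmt.Printf("connect=%v first_byte=%v\n",
			r.Trace.ConnectDuration, r.Trace.FirstByteDuration)
	})
	c.Visit("https://example.com/")
}
```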
diff --git a/http_trace_test.go b/http_trace_test.go
new file mode 100644
index 000000000..6f4d88d9d
--- /dev/null
+++ b/http_trace_test.go
@@ -0,0 +1,73 @@
+package colly
+
+import (
+ "net/http"
+ "net/http/httptest"
+ "testing"
+ "time"
+)
+
+const testDelay = 200 * time.Millisecond
+
+func newTraceTestServer(delay time.Duration) *httptest.Server {
+ mux := http.NewServeMux()
+
+ mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+ time.Sleep(delay)
+ w.WriteHeader(200)
+ })
+ mux.HandleFunc("/error", func(w http.ResponseWriter, r *http.Request) {
+ time.Sleep(delay)
+ w.WriteHeader(500)
+ })
+
+ return httptest.NewServer(mux)
+}
+
+func TestTraceWithNoDelay(t *testing.T) {
+ ts := newTraceTestServer(0)
+ defer ts.Close()
+
+ client := ts.Client()
+ req, err := http.NewRequest("GET", ts.URL, nil)
+ if err != nil {
+ t.Errorf("Failed to construct request %v", err)
+ }
+ trace := &HTTPTrace{}
+ req = trace.WithTrace(req)
+
+ if _, err = client.Do(req); err != nil {
+ t.Errorf("Failed to make request %v", err)
+ }
+
+ if trace.ConnectDuration > testDelay {
+ t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
+ }
+ if trace.FirstByteDuration > testDelay {
+ t.Errorf("trace FirstByteDuration should be (almost) 0, got %v", trace.FirstByteDuration)
+ }
+}
+
+func TestTraceWithDelay(t *testing.T) {
+ ts := newTraceTestServer(testDelay)
+ defer ts.Close()
+
+ client := ts.Client()
+ req, err := http.NewRequest("GET", ts.URL, nil)
+ if err != nil {
+ t.Errorf("Failed to construct request %v", err)
+ }
+ trace := &HTTPTrace{}
+ req = trace.WithTrace(req)
+
+ if _, err = client.Do(req); err != nil {
+ t.Errorf("Failed to make request %v", err)
+ }
+
+ if trace.ConnectDuration > testDelay {
+ t.Errorf("trace ConnectDuration should be (almost) 0, got %v", trace.ConnectDuration)
+ }
+ if trace.FirstByteDuration < testDelay {
+ t.Errorf("trace FirstByteDuration should be at least 200ms, got %v", trace.FirstByteDuration)
+ }
+}
diff --git a/proxy/proxy.go b/proxy/proxy.go
index 18bcb2ad3..a4bd84852 100644
--- a/proxy/proxy.go
+++ b/proxy/proxy.go
@@ -20,7 +20,7 @@ import (
"net/url"
"sync/atomic"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
)
type roundRobinSwitcher struct {
@@ -29,8 +29,9 @@ type roundRobinSwitcher struct {
}
func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) {
- u := r.proxyURLs[r.index%uint32(len(r.proxyURLs))]
- atomic.AddUint32(&r.index, 1)
+ index := atomic.AddUint32(&r.index, 1) - 1
+ u := r.proxyURLs[index%uint32(len(r.proxyURLs))]
+
ctx := context.WithValue(pr.Context(), colly.ProxyURLKey, u.String())
*pr = *pr.WithContext(ctx)
return u, nil
@@ -42,6 +43,9 @@ func (r *roundRobinSwitcher) GetProxy(pr *http.Request) (*url.URL, error) {
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func RoundRobinProxySwitcher(ProxyURLs ...string) (colly.ProxyFunc, error) {
+ if len(ProxyURLs) < 1 {
+ return nil, colly.ErrEmptyProxyURL
+ }
urls := make([]*url.URL, len(ProxyURLs))
for i, u := range ProxyURLs {
parsedU, err := url.Parse(u)
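
Usage of the switcher is unchanged; only the error on an empty URL list is new. A minimal sketch with placeholder proxy addresses:

```go
package main

import (
	"log"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/proxy"
)

func main() {
	c := colly.NewCollector()
	rp, err := proxy.RoundRobinProxySwitcher(
		"socks5://127.0.0.1:1337",
		"http://127.0.0.1:8080",
	)
	if err != nil {
		log.Fatal(err) // an empty list now yields colly.ErrEmptyProxyURL
	}
	c.SetProxyFunc(rp)
	c.Visit("https://example.com/")
}
```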
diff --git a/queue/queue.go b/queue/queue.go
index f7a133d3b..0d0d78a66 100644
--- a/queue/queue.go
+++ b/queue/queue.go
@@ -3,14 +3,18 @@ package queue
import (
"net/url"
"sync"
- "sync/atomic"
- "github.com/gocolly/colly"
+ whatwgUrl "github.com/nlnwa/whatwg-url/url"
+
+ "github.com/gocolly/colly/v2"
)
const stop = true
+var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
+
// Storage is the interface of the queue's storage backend
+// Storage must be safe for concurrent use by multiple goroutines.
type Storage interface {
// Init initializes the storage
Init() error
@@ -27,11 +31,11 @@ type Storage interface {
// requests in multiple threads
type Queue struct {
// Threads defines the number of consumer threads
- Threads int
- storage Storage
- activeThreadCount int32
- threadChans []chan bool
- lock *sync.Mutex
+ Threads int
+ storage Storage
+ wake chan struct{}
+ mut sync.Mutex // guards wake and running
+ running bool
}
// InMemoryQueueStorage is the default implementation of the Storage interface.
@@ -61,10 +65,9 @@ func New(threads int, s Storage) (*Queue, error) {
return nil, err
}
return &Queue{
- Threads: threads,
- storage: s,
- lock: &sync.Mutex{},
- threadChans: make([]chan bool, 0, threads),
+ Threads: threads,
+ storage: s,
+ running: true,
}, nil
}
@@ -76,12 +79,16 @@ func (q *Queue) IsEmpty() bool {
// AddURL adds a new URL to the queue
func (q *Queue) AddURL(URL string) error {
- u, err := url.Parse(URL)
+ u, err := urlParser.Parse(URL)
+ if err != nil {
+ return err
+ }
+ u2, err := url.Parse(u.Href(false))
if err != nil {
return err
}
r := &colly.Request{
- URL: u,
+ URL: u2,
Method: "GET",
}
d, err := r.Marshal()
@@ -93,20 +100,26 @@ func (q *Queue) AddURL(URL string) error {
// AddRequest adds a new Request to the queue
func (q *Queue) AddRequest(r *colly.Request) error {
- d, err := r.Marshal()
+ q.mut.Lock()
+ waken := q.wake != nil
+ q.mut.Unlock()
+ if !waken {
+ return q.storeRequest(r)
+ }
+ err := q.storeRequest(r)
if err != nil {
return err
}
- if err := q.storage.AddRequest(d); err != nil {
+ q.wake <- struct{}{}
+ return nil
+}
+
+func (q *Queue) storeRequest(r *colly.Request) error {
+ d, err := r.Marshal()
+ if err != nil {
return err
}
- q.lock.Lock()
- for _, c := range q.threadChans {
- c <- !stop
- }
- q.threadChans = make([]chan bool, 0, q.Threads)
- q.lock.Unlock()
- return nil
+ return q.storage.AddRequest(d)
}
// Size returns the size of the queue
@@ -116,56 +129,96 @@ func (q *Queue) Size() (int, error) {
// Run starts consumer threads and calls the Collector
// to perform requests. Run blocks while the queue has active requests
+// The given Storage must not be used directly while Run blocks.
func (q *Queue) Run(c *colly.Collector) error {
- wg := &sync.WaitGroup{}
+ q.mut.Lock()
+ if q.wake != nil && q.running {
+ q.mut.Unlock()
+ panic("cannot call duplicate Queue.Run")
+ }
+ q.wake = make(chan struct{})
+ q.running = true
+ q.mut.Unlock()
+
+ requestc := make(chan *colly.Request)
+ complete, errc := make(chan struct{}), make(chan error, 1)
for i := 0; i < q.Threads; i++ {
- wg.Add(1)
- go func(c *colly.Collector, wg *sync.WaitGroup) {
- defer wg.Done()
- for {
- if q.IsEmpty() {
- if q.activeThreadCount == 0 {
- break
- }
- ch := make(chan bool)
- q.lock.Lock()
- q.threadChans = append(q.threadChans, ch)
- q.lock.Unlock()
- action := <-ch
- if action == stop && q.IsEmpty() {
- break
- }
- }
- q.lock.Lock()
- atomic.AddInt32(&q.activeThreadCount, 1)
- q.lock.Unlock()
- rb, err := q.storage.GetRequest()
- if err != nil || rb == nil {
- q.finish()
- continue
+ go independentRunner(requestc, complete)
+ }
+ go q.loop(c, requestc, complete, errc)
+ defer close(requestc)
+ return <-errc
+}
+
+// Stop will stop the running queue
+func (q *Queue) Stop() {
+ q.mut.Lock()
+ q.running = false
+ q.mut.Unlock()
+}
+
+func (q *Queue) loop(c *colly.Collector, requestc chan<- *colly.Request, complete <-chan struct{}, errc chan<- error) {
+ var active int
+ for {
+ size, err := q.storage.QueueSize()
+ if err != nil {
+ errc <- err
+ break
+ }
+ if size == 0 && active == 0 || !q.running {
+ // Terminate when
+ // 1. the queue is empty and there are no active requests, or
+ // 2. the queue has been stopped via Stop()
+ errc <- nil
+ break
+ }
+ sent := requestc
+ var req *colly.Request
+ if size > 0 {
+ req, err = q.loadRequest(c)
+ if err != nil {
+ // ignore an error returned by GetRequest() or
+ // UnmarshalRequest()
+ continue
+ }
+ } else {
+ sent = nil
+ }
+ Sent:
+ for {
+ select {
+ case sent <- req:
+ active++
+ break Sent
+ case <-q.wake:
+ if sent == nil {
+ break Sent
}
- r, err := c.UnmarshalRequest(rb)
- if err != nil || r == nil {
- q.finish()
- continue
+ case <-complete:
+ active--
+ if sent == nil && active == 0 {
+ break Sent
}
- r.Do()
- q.finish()
}
- }(c, wg)
+ }
}
- wg.Wait()
- return nil
}
-func (q *Queue) finish() {
- q.lock.Lock()
- q.activeThreadCount--
- for _, c := range q.threadChans {
- c <- stop
+func independentRunner(requestc <-chan *colly.Request, complete chan<- struct{}) {
+ for req := range requestc {
+ req.Do()
+ complete <- struct{}{}
+ }
+}
+
+func (q *Queue) loadRequest(c *colly.Collector) (*colly.Request, error) {
+ buf, err := q.storage.GetRequest()
+ if err != nil {
+ return nil, err
}
- q.threadChans = make([]chan bool, 0, q.Threads)
- q.lock.Unlock()
+ copied := make([]byte, len(buf))
+ copy(copied, buf)
+ return c.UnmarshalRequest(copied)
}
// Init implements Storage.Init() function
@@ -180,7 +233,7 @@ func (q *InMemoryQueueStorage) AddRequest(r []byte) error {
defer q.lock.Unlock()
// Discard URLs if size limit exceeded
if q.MaxSize > 0 && q.size >= q.MaxSize {
- return nil
+ return colly.ErrQueueFull
}
i := &inMemoryQueueItem{Request: r}
if q.first == nil {
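
Taken together, the reworked queue keeps its public surface (New, AddURL, AddRequest, Run) and gains Stop; a minimal sketch of draining a queue with two consumer goroutines:

```go
package main

import (
	"log"

	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

func main() {
	q, err := queue.New(2, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		log.Fatal(err)
	}
	q.AddURL("https://example.com/")
	c := colly.NewCollector()
	// Run blocks until the queue drains or Stop() is called; AddURL and
	// AddRequest may safely be called from callbacks while it runs.
	if err := q.Run(c); err != nil {
		log.Fatal(err)
	}
}
```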
diff --git a/queue/queue_test.go b/queue/queue_test.go
new file mode 100644
index 000000000..1d10f8377
--- /dev/null
+++ b/queue/queue_test.go
@@ -0,0 +1,112 @@
+package queue
+
+import (
+ "math/rand"
+ "net/http"
+ "net/http/httptest"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/gocolly/colly/v2"
+)
+
+func TestQueue(t *testing.T) {
+ server := httptest.NewServer(http.HandlerFunc(serverHandler))
+ defer server.Close()
+
+ rng := rand.New(rand.NewSource(12387123712321232))
+ var rngMu sync.Mutex
+
+ var (
+ items uint32
+ requests uint32
+ success uint32
+ failure uint32
+ )
+ storage := &InMemoryQueueStorage{MaxSize: 100000}
+ q, err := New(10, storage)
+ if err != nil {
+ panic(err)
+ }
+ put := func() {
+ rngMu.Lock()
+ t := time.Duration(rng.Intn(50)) * time.Microsecond
+ rngMu.Unlock()
+ url := server.URL + "/delay?t=" + t.String()
+ atomic.AddUint32(&items, 1)
+ q.AddURL(url)
+ }
+ for i := 0; i < 3000; i++ {
+ put()
+ storage.AddRequest([]byte("error request"))
+ }
+ c := colly.NewCollector(
+ colly.AllowURLRevisit(),
+ )
+ c.OnRequest(func(req *colly.Request) {
+ atomic.AddUint32(&requests, 1)
+ })
+ c.OnResponse(func(resp *colly.Response) {
+ if resp.StatusCode == http.StatusOK {
+ atomic.AddUint32(&success, 1)
+ } else {
+ atomic.AddUint32(&failure, 1)
+ }
+ rngMu.Lock()
+ toss := rng.Intn(2) == 0
+ rngMu.Unlock()
+ if toss {
+ put()
+ }
+ })
+ c.OnError(func(resp *colly.Response, err error) {
+ atomic.AddUint32(&failure, 1)
+ })
+ err = q.Run(c)
+ if err != nil {
+ t.Fatalf("Queue.Run() return an error: %v", err)
+ }
+ if items != requests || success+failure != requests || failure > 0 {
+ t.Fatalf("wrong Queue implementation: "+
+ "items = %d, requests = %d, success = %d, failure = %d",
+ items, requests, success, failure)
+ }
+}
+
+func serverHandler(w http.ResponseWriter, req *http.Request) {
+ if !serverRoute(w, req) {
+ shutdown(w)
+ }
+}
+
+func serverRoute(w http.ResponseWriter, req *http.Request) bool {
+ if req.URL.Path == "/delay" {
+ return serveDelay(w, req) == nil
+ }
+ return false
+}
+
+func serveDelay(w http.ResponseWriter, req *http.Request) error {
+ q := req.URL.Query()
+ t, err := time.ParseDuration(q.Get("t"))
+ if err != nil {
+ return err
+ }
+ time.Sleep(t)
+ w.WriteHeader(http.StatusOK)
+ return nil
+}
+
+func shutdown(w http.ResponseWriter) {
+ taker, ok := w.(http.Hijacker)
+ if !ok {
+ return
+ }
+ raw, _, err := taker.Hijack()
+ if err != nil {
+ return
+ }
+ raw.Close()
+}
diff --git a/request.go b/request.go
index 4b94cd209..5c80e2bb8 100644
--- a/request.go
+++ b/request.go
@@ -18,7 +18,6 @@ import (
"bytes"
"encoding/json"
"io"
- "io/ioutil"
"net/http"
"net/url"
"strings"
@@ -31,6 +30,8 @@ type Request struct {
URL *url.URL
// Headers contains the Request's HTTP headers
Headers *http.Header
+ // the Host header
+ Host string
// Ctx is a context between a Request and a Response
Ctx *Context
// Depth is the number of the parents of the request
@@ -55,24 +56,31 @@ type Request struct {
type serializableRequest struct {
URL string
Method string
+ Depth int
Body []byte
ID uint32
Ctx map[string]interface{}
Headers http.Header
+ Host string
}
// New creates a new request with the context of the original request
func (r *Request) New(method, URL string, body io.Reader) (*Request, error) {
- u, err := url.Parse(URL)
+ u, err := urlParser.Parse(URL)
+ if err != nil {
+ return nil, err
+ }
+ u2, err := url.Parse(u.Href(false))
if err != nil {
return nil, err
}
return &Request{
Method: method,
- URL: u,
+ URL: u2,
Body: body,
Ctx: r.Ctx,
Headers: &http.Header{},
+ Host: r.Host,
ID: atomic.AddUint32(&r.collector.requestCount, 1),
collector: r.collector,
}, nil
@@ -96,15 +104,12 @@ func (r *Request) AbsoluteURL(u string) string {
} else {
base = r.URL
}
- absURL, err := base.Parse(u)
+
+ absURL, err := urlParser.ParseRef(base.String(), u)
if err != nil {
return ""
}
- absURL.Fragment = ""
- if absURL.Scheme == "//" {
- absURL.Scheme = r.URL.Scheme
- }
- return absURL.String()
+ return absURL.Href(false)
}
// Visit continues Collector's collecting job by creating a
@@ -114,6 +119,11 @@ func (r *Request) Visit(URL string) error {
return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil, true)
}
+// HasVisited checks if the provided URL has been visited
+func (r *Request) HasVisited(URL string) (bool, error) {
+ return r.collector.HasVisited(URL)
+}
+
// Post continues a collector job by creating a POST request and preserves the Context
// of the previous request.
// Post also calls the previously provided callbacks
@@ -141,6 +151,10 @@ func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error
// Retry submits HTTP request again with the same parameters
func (r *Request) Retry() error {
+ r.Headers.Del("Cookie")
+ if _, ok := r.Body.(io.ReadSeeker); r.Body != nil && !ok {
+ return ErrRetryBodyUnseekable
+ }
return r.collector.scrape(r.URL.String(), r.Method, r.Depth, r.Body, r.Ctx, *r.Headers, false)
}
@@ -161,14 +175,16 @@ func (r *Request) Marshal() ([]byte, error) {
var err error
var body []byte
if r.Body != nil {
- body, err = ioutil.ReadAll(r.Body)
+ body, err = io.ReadAll(r.Body)
if err != nil {
return nil, err
}
}
sr := &serializableRequest{
URL: r.URL.String(),
+ Host: r.Host,
Method: r.Method,
+ Depth: r.Depth,
Body: body,
ID: r.ID,
Ctx: ctx,
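
Retry now drops the Cookie header and rejects non-seekable bodies with ErrRetryBodyUnseekable. A sketch of a bounded retry from OnError; the "retried" context key is purely illustrative, not part of the API:

```go
package main

import (
	"log"

	"github.com/gocolly/colly/v2"
)

func main() {
	c := colly.NewCollector()
	c.OnError(func(r *colly.Response, err error) {
		// Retry transient server errors once; the request context
		// survives Retry(), so a marker prevents endless loops.
		if r.StatusCode >= 500 && r.Ctx.Get("retried") == "" {
			r.Ctx.Put("retried", "true")
			if err := r.Request.Retry(); err != nil {
				log.Println("retry failed:", err) // e.g. ErrRetryBodyUnseekable
			}
		}
	})
	c.Visit("https://example.com/")
}
```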
diff --git a/response.go b/response.go
index 29ba6ae14..30cdeae66 100644
--- a/response.go
+++ b/response.go
@@ -17,9 +17,10 @@ package colly
import (
"bytes"
"fmt"
- "io/ioutil"
+ "io"
"mime"
"net/http"
+ "os"
"strings"
"github.com/saintfish/chardet"
@@ -38,11 +39,14 @@ type Response struct {
Request *Request
// Headers contains the Response's HTTP headers
Headers *http.Header
+ // Trace contains the HTTPTrace for the request. Will only be set by the
+ // collector if Collector.TraceHTTP is set to true.
+ Trace *HTTPTrace
}
// Save writes response body to disk
func (r *Response) Save(fileName string) error {
- return ioutil.WriteFile(fileName, r.Body, 0644)
+ return os.WriteFile(fileName, r.Body, 0644)
}
// FileName returns the sanitized file name parsed from "Content-Disposition"
@@ -59,6 +63,9 @@ func (r *Response) FileName() string {
}
func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error {
+ if len(r.Body) == 0 {
+ return nil
+ }
if defaultEncoding != "" {
tmpBody, err := encodeBytes(r.Body, "text/plain; charset="+defaultEncoding)
if err != nil {
@@ -68,6 +75,16 @@ func (r *Response) fixCharset(detectCharset bool, defaultEncoding string) error
return nil
}
contentType := strings.ToLower(r.Headers.Get("Content-Type"))
+
+ if strings.Contains(contentType, "image/") ||
+ strings.Contains(contentType, "video/") ||
+ strings.Contains(contentType, "audio/") ||
+ strings.Contains(contentType, "font/") {
+ // These MIME types should not have textual data.
+ return nil
+ }
+
if !strings.Contains(contentType, "charset") {
if !detectCharset {
return nil
@@ -95,5 +112,5 @@ func encodeBytes(b []byte, contentType string) ([]byte, error) {
if err != nil {
return nil, err
}
- return ioutil.ReadAll(r)
+ return io.ReadAll(r)
}
diff --git a/unmarshal.go b/unmarshal.go
index 302f25871..42ceb2a69 100644
--- a/unmarshal.go
+++ b/unmarshal.go
@@ -35,17 +35,17 @@ func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]strin
// UnmarshalHTML declaratively extracts text or attributes to a struct from
// HTML response using struct tags composed of css selectors.
// Allowed struct tags:
-// - "selector" (required): CSS (goquery) selector of the desired data
-// - "attr" (optional): Selects the matching element's attribute's value.
+// - "selector" (required): CSS (goquery) selector of the desired data
+// - "attr" (optional): Selects the matching element's attribute's value.
// Leave it blank or omit to get the text of the element.
//
// Example struct declaration:
//
-// type Nested struct {
-// String string `selector:"div > p"`
-// Classes []string `selector:"li" attr:"class"`
-// Struct *Nested `selector:"div > div"`
-// }
+// type Nested struct {
+// String string `selector:"div > p"`
+// Classes []string `selector:"li" attr:"class"`
+// Struct *Nested `selector:"div > div"`
+// }
//
// Supported types: struct, *struct, string, []string
func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error {
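
For reference, the Nested struct from the doc comment above would be consumed like this (the OnHTML selector is illustrative):

```go
package main

import (
	"fmt"
	"log"

	"github.com/gocolly/colly/v2"
)

type Nested struct {
	String  string   `selector:"div > p"`
	Classes []string `selector:"li" attr:"class"`
	Struct  *Nested  `selector:"div > div"`
}

func main() {
	c := colly.NewCollector()
	c.OnHTML("div.outer", func(e *colly.HTMLElement) {
		n := Nested{}
		// Unmarshal applies the struct tags against e's subtree.
		if err := e.Unmarshal(&n); err != nil {
			log.Println(err)
			return
		}
		fmt.Printf("%+v\n", n)
	})
	c.Visit("https://example.com/")
}
```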
diff --git a/xmlelement.go b/xmlelement.go
index 7ff5fe553..857900e85 100644
--- a/xmlelement.go
+++ b/xmlelement.go
@@ -15,7 +15,6 @@
package colly
import (
- "encoding/xml"
"strings"
"github.com/antchfx/htmlquery"
@@ -76,7 +75,7 @@ func (h *XMLElement) Attr(k string) string {
}
}
} else {
- for _, a := range h.attributes.([]xml.Attr) {
+ for _, a := range h.attributes.([]xmlquery.Attr) {
if a.Name.Local == k {
return a.Value
}
diff --git a/xmlelement_test.go b/xmlelement_test.go
index ac7a1aeca..90a434826 100644
--- a/xmlelement_test.go
+++ b/xmlelement_test.go
@@ -16,7 +16,7 @@ package colly_test
import (
"github.com/antchfx/htmlquery"
- "github.com/gocolly/colly"
+ "github.com/gocolly/colly/v2"
"reflect"
"strings"
"testing"