[examples] Use functional options for NewCollector
peterhellberg committed Jan 5, 2018
1 parent 09a4706 commit b59af10
Showing 12 changed files with 54 additions and 54 deletions.
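The change throughout is mechanical: settings that used to be assigned to Collector fields after construction are now passed to NewCollector as functional options, i.e. variadic func(*colly.Collector) values applied during construction. A minimal sketch of the pattern — the option names mirror colly's API, but the bodies here are illustrative, not colly's actual source:

package sketch

// Collector holds a few of the settings configured in these examples.
type Collector struct {
	AllowedDomains []string
	MaxDepth       int
	CacheDir       string
}

// NewCollector builds a Collector, then applies each functional option to it.
func NewCollector(options ...func(*Collector)) *Collector {
	c := &Collector{}
	for _, f := range options {
		f(c)
	}
	return c
}

// AllowedDomains returns an option restricting requests to the given domains.
func AllowedDomains(domains ...string) func(*Collector) {
	return func(c *Collector) { c.AllowedDomains = domains }
}

// MaxDepth returns an option limiting how many links deep the collector goes.
func MaxDepth(depth int) func(*Collector) {
	return func(c *Collector) { c.MaxDepth = depth }
}

A bare NewCollector() call still produces the default configuration; callers compose only the options they need, as the diffs below show.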
8 changes: 4 additions & 4 deletions _examples/basic/basic.go
@@ -8,10 +8,10 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
-	c.AllowedDomains = []string{"hackerspaces.org", "wiki.hackerspaces.org"}
+	c := colly.NewCollector(
+		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
+		colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
14 changes: 7 additions & 7 deletions _examples/coursera_courses/coursera_courses.go
@@ -25,14 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// Visit only domains: coursera.org, www.coursera.org
+		colly.AllowedDomains("coursera.org", "www.coursera.org"),
 
-	// Visit only domains: coursera.org, www.coursera.org
-	c.AllowedDomains = []string{"coursera.org", "www.coursera.org"}
-
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./coursera_cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./coursera_cache"),
+	)
 
 	// Create another collector to scrape course details
 	detailCollector := c.Clone()
7 changes: 4 additions & 3 deletions _examples/instagram/instagram.go
@@ -32,9 +32,10 @@ func main() {
 	instagramAccount := os.Args[1]
 	outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)
 
-	c := colly.NewCollector()
-	c.CacheDir = "./_instagram_cache/"
-	c.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+	c := colly.NewCollector(
+		colly.CacheDir("./_instagram_cache/"),
+		colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
+	)
 
 	c.OnHTML("body > script:first-of-type", func(e *colly.HTMLElement) {
 		jsonData := e.Text[strings.Index(e.Text, "{") : len(e.Text)-1]
10 changes: 5 additions & 5 deletions _examples/max_depth/max_depth.go
@@ -8,10 +8,10 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// MaxDepth is 1, so only the links on the scraped page
-	// is visited, and no further links are followed
-	c.MaxDepth = 1
+	c := colly.NewCollector(
+		// MaxDepth is 1, so only the links on the scraped page
+		// is visited, and no further links are followed
+		colly.MaxDepth(1),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
4 changes: 1 addition & 3 deletions _examples/multipart/multipart.go
@@ -46,9 +46,7 @@ func main() {
 	// Start a single route http server to post an image to.
 	setupServer()
 
-	c := colly.NewCollector()
-	c.AllowURLRevisit = true
-	c.MaxDepth = 5
+	c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("html", func(e *colly.HTMLElement) {
13 changes: 7 additions & 6 deletions _examples/openedx_courses/openedx_courses.go
@@ -25,13 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-	// Using IndonesiaX as sample
-	c.AllowedDomains = []string{"indonesiax.co.id", "www.indonesiax.co.id"}
+	c := colly.NewCollector(
+		// Using IndonesiaX as sample
+		colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"),
 
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./cache"),
+	)
 
 	courses := make([]Course, 0, 200)
 
10 changes: 5 additions & 5 deletions _examples/parallel/parallel.go
@@ -8,7 +8,11 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// MaxDepth is 2, so only the links on the scraped page
+		// and links on those pages are visited
+		colly.MaxDepth(2),
+	)
 
 	// Limit the maximum parallelism to 5
 	// This is necessary if the goroutines are dynamically
@@ -18,10 +22,6 @@ func main() {
 	// number of go routines.
 	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5})
 
-	// MaxDepth is 2, so only the links on the scraped page
-	// and links on those pages are visited
-	c.MaxDepth = 2
-
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
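Note what stays outside the constructor in this example: per-domain rate limits and callbacks are still configured as method calls on the built collector. A hedged usage sketch combining both styles — the MaxDepth option and LimitRule fields come from this diff, while the httpbin URL is only a placeholder:

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

func main() {
	// Construction-time settings are passed as functional options.
	c := colly.NewCollector(colly.MaxDepth(2))

	// Runtime behavior is still configured on the instance afterwards.
	c.Limit(&colly.LimitRule{DomainGlob: "*httpbin.*", Parallelism: 2})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("visiting", r.URL)
	})

	c.Visit("https://httpbin.org/")
}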
3 changes: 1 addition & 2 deletions _examples/proxy_switcher/proxy_switcher.go
@@ -10,7 +10,7 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(colly.AllowURLRevisit())
 
 	// Rotate two socks5 proxies
 	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
@@ -25,7 +25,6 @@ func main() {
 	})
 
 	// Fetch httpbin.org/ip five times
-	c.AllowURLRevisit = true
 	for i := 0; i < 5; i++ {
 		c.Visit("https://httpbin.org/ip")
 	}
8 changes: 4 additions & 4 deletions _examples/random_delay/random_delay.go
@@ -12,10 +12,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
8 changes: 4 additions & 4 deletions _examples/rate_limit/rate_limit.go
@@ -11,10 +11,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
14 changes: 7 additions & 7 deletions _examples/url_filter/url_filter.go
@@ -9,13 +9,13 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only root url and urls which start with "e" or "h" on httpbin.org
-	c.URLFilters = []*regexp.Regexp{
-		regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
-		regexp.MustCompile("http://httpbin\\.org/h.+"),
-	}
+	c := colly.NewCollector(
+		// Visit only root url and urls which start with "e" or "h" on httpbin.org
+		colly.URLFilters(
+			regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
+			regexp.MustCompile("http://httpbin\\.org/h.+"),
+		),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
9 changes: 5 additions & 4 deletions _examples/xkcd_store/xkcd_store.go
@@ -20,11 +20,12 @@ func main() {
 	defer writer.Flush()
 	// Write CSV header
 	writer.Write([]string{"Name", "Price", "URL", "Image URL"})
-	// Instantiate default collector
-	c := colly.NewCollector()
 
-	// Allow requests only to store.xkcd.com
-	c.AllowedDomains = []string{"store.xkcd.com"}
+	// Instantiate default collector
+	c := colly.NewCollector(
+		// Allow requests only to store.xkcd.com
+		colly.AllowedDomains("store.xkcd.com"),
+	)
 
 	// Extract product details
 	c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
