Merge pull request gocolly#84 from peterhellberg/examples-with-functional-options

Examples with functional options
asciimoo authored Jan 6, 2018
2 parents 2cf04e2 + e13b243 commit 8ce8057
Showing 15 changed files with 68 additions and 80 deletions.
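
Every example below applies the same refactor: configuration moves from post-construction field assignment (c.AllowedDomains = ..., c.MaxDepth = ...) into functional options passed to colly.NewCollector. An option is just a func(*colly.Collector) that mutates the collector, and the constructor applies each one in order. A minimal sketch of the pattern (the lowercase maxBodySize constructor is hypothetical, written out only to show the shape; the real options exercised in this PR are colly.AllowedDomains, colly.CacheDir, colly.MaxDepth, colly.UserAgent, colly.AllowURLRevisit, colly.URLFilters, and colly.Debugger):

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

// maxBodySize is a hypothetical re-implementation of a functional
// option: it returns a closure that mutates the collector.
func maxBodySize(n int) func(*colly.Collector) {
	return func(c *colly.Collector) {
		c.MaxBodySize = n
	}
}

func main() {
	// NewCollector applies each option to the freshly created
	// collector before returning it.
	c := colly.NewCollector(
		colly.AllowedDomains("httpbin.org"),
		maxBodySize(10*1024*1024),
	)
	fmt.Println(c.MaxBodySize)
}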
2 changes: 0 additions & 2 deletions .travis.yml
@@ -5,8 +5,6 @@ go:
 - 1.8.x
 - 1.9.x
 - tip
-env:
-  - "PATH=$GOPATH/bin:$PATH"
 script:
 - go get -u github.com/golang/lint/golint
 - OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
8 changes: 4 additions & 4 deletions _examples/basic/basic.go
@@ -8,10 +8,10 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
-	c.AllowedDomains = []string{"hackerspaces.org", "wiki.hackerspaces.org"}
+	c := colly.NewCollector(
+		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
+		colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
27 changes: 12 additions & 15 deletions _examples/coursera_courses/coursera_courses.go
@@ -2,8 +2,8 @@ package main
 
 import (
 	"encoding/json"
-	"fmt"
 	"log"
+	"os"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -25,14 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// Visit only domains: coursera.org, www.coursera.org
+		colly.AllowedDomains("coursera.org", "www.coursera.org"),
 
-	// Visit only domains: coursera.org, www.coursera.org
-	c.AllowedDomains = []string{"coursera.org", "www.coursera.org"}
-
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./coursera_cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./coursera_cache"),
+	)
 
 	// Create another collector to scrape course details
 	detailCollector := c.Clone()
@@ -104,12 +104,9 @@ func main() {
 	// Start scraping on http://coursera.com/browse
 	c.Visit("https://coursera.org/browse")
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(courses, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(courses)
 }
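
The JSON output change above (repeated verbatim in the google_groups and hackernews_comments examples below) replaces json.MarshalIndent plus fmt.Println with a json.Encoder that streams straight to os.Stdout: no intermediate []byte, and no error to panic on (Encode's error is simply ignored in these examples). A standalone sketch of the new pattern:

package main

import (
	"encoding/json"
	"os"
)

func main() {
	// Stand-in for the scraped results collected above.
	courses := []string{"Course A", "Course B"}

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")

	// Encode writes the indented JSON, followed by a newline,
	// directly to the standard output.
	enc.Encode(courses)
}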
13 changes: 5 additions & 8 deletions _examples/google_groups/google_groups.go
@@ -3,8 +3,8 @@ package main
 import (
 	"encoding/json"
 	"flag"
-	"fmt"
 	"log"
+	"os"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -87,12 +87,9 @@ func main() {
 
 	threadCollector.Visit("https://groups.google.com/forum/?_escaped_fragment_=forum/" + groupName)
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(threads, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(threads)
 }
12 changes: 4 additions & 8 deletions _examples/hackernews_comments/hackernews_comments.go
@@ -3,7 +3,6 @@ package main
 import (
 	"encoding/json"
 	"flag"
-	"fmt"
 	"log"
 	"os"
 	"strconv"
@@ -65,12 +64,9 @@ func main() {
 
 	c.Visit("https://news.ycombinator.com/item?id=" + itemID)
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(comments, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(comments)
 }
7 changes: 4 additions & 3 deletions _examples/instagram/instagram.go
@@ -32,9 +32,10 @@ func main() {
 	instagramAccount := os.Args[1]
 	outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)
 
-	c := colly.NewCollector()
-	c.CacheDir = "./_instagram_cache/"
-	c.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+	c := colly.NewCollector(
+		colly.CacheDir("./_instagram_cache/"),
+		colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
+	)
 
 	c.OnHTML("body > script:first-of-type", func(e *colly.HTMLElement) {
 		jsonData := e.Text[strings.Index(e.Text, "{") : len(e.Text)-1]
10 changes: 5 additions & 5 deletions _examples/max_depth/max_depth.go
@@ -8,11 +8,11 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// MaxDepth is 1, so only the links on the scraped page
-	// is visited, and no further links are followed
-	c.MaxDepth = 1
+	c := colly.NewCollector(
+		// MaxDepth is 1, so only the links on the scraped page
+		// is visited, and no further links are followed
+		colly.MaxDepth(1),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
4 changes: 1 addition & 3 deletions _examples/multipart/multipart.go
@@ -46,9 +46,7 @@ func main() {
 	// Start a single route http server to post an image to.
 	setupServer()
 
-	c := colly.NewCollector()
-	c.AllowURLRevisit = true
-	c.MaxDepth = 5
+	c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("html", func(e *colly.HTMLElement) {
13 changes: 7 additions & 6 deletions _examples/openedx_courses/openedx_courses.go
@@ -25,13 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-	// Using IndonesiaX as sample
-	c.AllowedDomains = []string{"indonesiax.co.id", "www.indonesiax.co.id"}
+	c := colly.NewCollector(
+		// Using IndonesiaX as sample
+		colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"),
 
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./cache"),
+	)
 
 	courses := make([]Course, 0, 200)
 
10 changes: 5 additions & 5 deletions _examples/parallel/parallel.go
@@ -8,7 +8,11 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// MaxDepth is 2, so only the links on the scraped page
+		// and links on those pages are visited
+		colly.MaxDepth(2),
+	)
 
 	// Limit the maximum parallelism to 5
 	// This is necessary if the goroutines are dynamically
@@ -18,10 +22,6 @@ func main() {
 	// number of go routines.
 	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5})
 
-	// MaxDepth is 2, so only the links on the scraped page
-	// and links on those pages are visited
-	c.MaxDepth = 2
-
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
3 changes: 1 addition & 2 deletions _examples/proxy_switcher/proxy_switcher.go
@@ -10,7 +10,7 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(colly.AllowURLRevisit())
 
 	// Rotate two socks5 proxies
 	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
@@ -25,7 +25,6 @@ func main() {
 	})
 
 	// Fetch httpbin.org/ip five times
-	c.AllowURLRevisit = true
 	for i := 0; i < 5; i++ {
 		c.Visit("https://httpbin.org/ip")
 	}
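
Put together, the proxy example now reads roughly as below (a sketch assembled from the two hunks above plus the unchanged context). Moving AllowURLRevisit into the constructor matters here because the loop fetches the same URL five times, which colly's duplicate-URL check would otherwise block:

package main

import (
	"log"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/proxy"
)

func main() {
	// AllowURLRevisit lets the collector fetch the same URL repeatedly.
	c := colly.NewCollector(colly.AllowURLRevisit())

	// Rotate two socks5 proxies: RoundRobinProxySwitcher returns a
	// proxy function that cycles through the given addresses.
	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	// Fetch httpbin.org/ip five times, once through each proxy in turn.
	for i := 0; i < 5; i++ {
		c.Visit("https://httpbin.org/ip")
	}
}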
8 changes: 4 additions & 4 deletions _examples/random_delay/random_delay.go
@@ -12,10 +12,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
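
The hunk cuts off before the Limit call that its trailing comments describe. Assuming the rule looks the way the example's name and comments suggest (a LimitRule pairing Parallelism with a RandomDelay; both fields are assumptions here, not shown in this diff), the collector setup would be roughly:

package main

import (
	"time"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
)

func main() {
	c := colly.NewCollector(
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Assumed shape of the rule the comments above refer to: at most
	// two concurrent requests to httpbin.org hosts, each preceded by
	// a random delay.
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})

	for i := 0; i < 4; i++ {
		c.Visit("https://httpbin.org/delay/2")
	}
}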
8 changes: 4 additions & 4 deletions _examples/rate_limit/rate_limit.go
@@ -11,10 +11,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
14 changes: 7 additions & 7 deletions _examples/url_filter/url_filter.go
@@ -9,13 +9,13 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only root url and urls which start with "e" or "h" on httpbin.org
-	c.URLFilters = []*regexp.Regexp{
-		regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
-		regexp.MustCompile("http://httpbin\\.org/h.+"),
-	}
+	c := colly.NewCollector(
+		// Visit only root url and urls which start with "e" or "h" on httpbin.org
+		colly.URLFilters(
+			regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
+			regexp.MustCompile("http://httpbin\\.org/h.+"),
+		),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
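
An aside on the carried-over regexp literals: Go raw-string literals would drop the doubled backslashes. A drop-in equivalent for the two options above (a style note only, not part of this commit):

	colly.URLFilters(
		regexp.MustCompile(`http://httpbin\.org/(|e.+)$`),
		regexp.MustCompile(`http://httpbin\.org/h.+`),
	),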
9 changes: 5 additions & 4 deletions _examples/xkcd_store/xkcd_store.go
@@ -20,11 +20,12 @@ func main() {
 	defer writer.Flush()
 	// Write CSV header
 	writer.Write([]string{"Name", "Price", "URL", "Image URL"})
-	// Instantiate default collector
-	c := colly.NewCollector()
 
-	// Allow requests only to store.xkcd.com
-	c.AllowedDomains = []string{"store.xkcd.com"}
+	// Instantiate default collector
+	c := colly.NewCollector(
+		// Allow requests only to store.xkcd.com
+		colly.AllowedDomains("store.xkcd.com"),
+	)
 
 	// Extract product details
 	c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
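
For context, the writer in this hunk is an encoding/csv writer over a file created just above the hunk; a minimal sketch of that setup (the output file name here is an assumption for illustration):

package main

import (
	"encoding/csv"
	"log"
	"os"
)

func main() {
	// Assumed output file name, for illustration only.
	file, err := os.Create("xkcd_store_items.csv")
	if err != nil {
		log.Fatalf("Cannot create file: %v", err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Flush buffered rows to the file before the program exits.
	defer writer.Flush()

	// Write CSV header (same call as in the hunk above).
	writer.Write([]string{"Name", "Price", "URL", "Image URL"})
}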
