Merge pull request gocolly#84 from peterhellberg/examples-with-functional-options

Examples with functional options
asciimoo authored Jan 6, 2018
2 parents 2cf04e2 + e13b243 commit 8ce8057
Showing 15 changed files with 68 additions and 80 deletions.
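
Every example below applies the same refactor: configuration moves from post-construction field assignment (c.AllowedDomains = ..., c.MaxDepth = ...) into functional options passed to colly.NewCollector. An option is just a func(*colly.Collector) that mutates the collector, and the constructor applies each one in order. A minimal sketch of the pattern (the lowercase maxBodySize constructor is hypothetical, written out only to show the shape; the real options exercised in this PR are colly.AllowedDomains, colly.CacheDir, colly.MaxDepth, colly.UserAgent, colly.AllowURLRevisit, colly.URLFilters, and colly.Debugger):

package main

import (
	"fmt"

	"github.com/gocolly/colly"
)

// maxBodySize is a hypothetical re-implementation of a functional
// option: it returns a closure that mutates the collector.
func maxBodySize(n int) func(*colly.Collector) {
	return func(c *colly.Collector) {
		c.MaxBodySize = n
	}
}

func main() {
	// NewCollector applies each option to the freshly created
	// collector before returning it.
	c := colly.NewCollector(
		colly.AllowedDomains("httpbin.org"),
		maxBodySize(10*1024*1024),
	)
	fmt.Println(c.MaxBodySize)
}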
2 changes: 0 additions & 2 deletions .travis.yml
@@ -5,8 +5,6 @@ go:
 - 1.8.x
 - 1.9.x
 - tip
-env:
-  - "PATH=$GOPATH/bin:$PATH"
 script:
 - go get -u github.com/golang/lint/golint
 - OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
8 changes: 4 additions & 4 deletions _examples/basic/basic.go
@@ -8,10 +8,10 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
-	c.AllowedDomains = []string{"hackerspaces.org", "wiki.hackerspaces.org"}
+	c := colly.NewCollector(
+		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
+		colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org"),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
27 changes: 12 additions & 15 deletions _examples/coursera_courses/coursera_courses.go
@@ -2,8 +2,8 @@ package main
 
 import (
 	"encoding/json"
-	"fmt"
 	"log"
+	"os"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -25,14 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// Visit only domains: coursera.org, www.coursera.org
+		colly.AllowedDomains("coursera.org", "www.coursera.org"),
 
-	// Visit only domains: coursera.org, www.coursera.org
-	c.AllowedDomains = []string{"coursera.org", "www.coursera.org"}
-
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./coursera_cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./coursera_cache"),
+	)
 
 	// Create another collector to scrape course details
 	detailCollector := c.Clone()
@@ -104,12 +104,9 @@ func main() {
 	// Start scraping on http://coursera.com/browse
 	c.Visit("https://coursera.org/browse")
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(courses, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(courses)
 }
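
The JSON output change above (repeated verbatim in the google_groups and hackernews_comments examples below) replaces json.MarshalIndent plus fmt.Println with a json.Encoder that streams straight to os.Stdout: no intermediate []byte, and no error to panic on (Encode's error is simply ignored in these examples). A standalone sketch of the new pattern:

package main

import (
	"encoding/json"
	"os"
)

func main() {
	// Stand-in for the scraped results collected above.
	courses := []string{"Course A", "Course B"}

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")

	// Encode writes the indented JSON, followed by a newline,
	// directly to the standard output.
	enc.Encode(courses)
}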
13 changes: 5 additions & 8 deletions _examples/google_groups/google_groups.go
@@ -3,8 +3,8 @@ package main
 import (
 	"encoding/json"
 	"flag"
-	"fmt"
 	"log"
+	"os"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -87,12 +87,9 @@ func main() {
 
 	threadCollector.Visit("https://groups.google.com/forum/?_escaped_fragment_=forum/" + groupName)
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(threads, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(threads)
 }
12 changes: 4 additions & 8 deletions _examples/hackernews_comments/hackernews_comments.go
@@ -3,7 +3,6 @@ package main
 import (
 	"encoding/json"
 	"flag"
-	"fmt"
 	"log"
 	"os"
 	"strconv"
@@ -65,12 +64,9 @@ func main() {
 
 	c.Visit("https://news.ycombinator.com/item?id=" + itemID)
 
-	// Convert results to JSON data if the scraping job has finished
-	jsonData, err := json.MarshalIndent(comments, "", " ")
-	if err != nil {
-		panic(err)
-	}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", " ")
 
-	// Dump json to the standard output (can be redirected to a file)
-	fmt.Println(string(jsonData))
+	// Dump json to the standard output
+	enc.Encode(comments)
 }
7 changes: 4 additions & 3 deletions _examples/instagram/instagram.go
@@ -32,9 +32,10 @@ func main() {
 	instagramAccount := os.Args[1]
 	outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)
 
-	c := colly.NewCollector()
-	c.CacheDir = "./_instagram_cache/"
-	c.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+	c := colly.NewCollector(
+		colly.CacheDir("./_instagram_cache/"),
+		colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
+	)
 
 	c.OnHTML("body > script:first-of-type", func(e *colly.HTMLElement) {
 		jsonData := e.Text[strings.Index(e.Text, "{") : len(e.Text)-1]
10 changes: 5 additions & 5 deletions _examples/max_depth/max_depth.go
@@ -8,11 +8,11 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// MaxDepth is 1, so only the links on the scraped page
-	// is visited, and no further links are followed
-	c.MaxDepth = 1
+	c := colly.NewCollector(
+		// MaxDepth is 1, so only the links on the scraped page
+		// is visited, and no further links are followed
+		colly.MaxDepth(1),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
4 changes: 1 addition & 3 deletions _examples/multipart/multipart.go
@@ -46,9 +46,7 @@ func main() {
 	// Start a single route http server to post an image to.
 	setupServer()
 
-	c := colly.NewCollector()
-	c.AllowURLRevisit = true
-	c.MaxDepth = 5
+	c := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("html", func(e *colly.HTMLElement) {
13 changes: 7 additions & 6 deletions _examples/openedx_courses/openedx_courses.go
@@ -25,13 +25,14 @@ type Course struct {
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-	// Using IndonesiaX as sample
-	c.AllowedDomains = []string{"indonesiax.co.id", "www.indonesiax.co.id"}
+	c := colly.NewCollector(
+		// Using IndonesiaX as sample
+		colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"),
 
-	// Cache responses to prevent multiple download of pages
-	// even if the collector is restarted
-	c.CacheDir = "./cache"
+		// Cache responses to prevent multiple download of pages
+		// even if the collector is restarted
+		colly.CacheDir("./cache"),
+	)
 
 	courses := make([]Course, 0, 200)
 
10 changes: 5 additions & 5 deletions _examples/parallel/parallel.go
@@ -8,7 +8,11 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(
+		// MaxDepth is 2, so only the links on the scraped page
+		// and links on those pages are visited
+		colly.MaxDepth(2),
+	)
 
 	// Limit the maximum parallelism to 5
 	// This is necessary if the goroutines are dynamically
@@ -18,10 +22,6 @@ func main() {
 	// number of go routines.
 	c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5})
 
-	// MaxDepth is 2, so only the links on the scraped page
-	// and links on those pages are visited
-	c.MaxDepth = 2
-
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
 		link := e.Attr("href")
3 changes: 1 addition & 2 deletions _examples/proxy_switcher/proxy_switcher.go
@@ -10,7 +10,7 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
+	c := colly.NewCollector(colly.AllowURLRevisit())
 
 	// Rotate two socks5 proxies
 	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
@@ -25,7 +25,6 @@ func main() {
 	})
 
 	// Fetch httpbin.org/ip five times
-	c.AllowURLRevisit = true
 	for i := 0; i < 5; i++ {
 		c.Visit("https://httpbin.org/ip")
 	}
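
Put together, the proxy example now reads roughly as below (a sketch assembled from the two hunks above plus the unchanged context). Moving AllowURLRevisit into the constructor matters here because the loop fetches the same URL five times, which colly's duplicate-URL check would otherwise block:

package main

import (
	"log"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/proxy"
)

func main() {
	// AllowURLRevisit lets the collector fetch the same URL repeatedly.
	c := colly.NewCollector(colly.AllowURLRevisit())

	// Rotate two socks5 proxies: RoundRobinProxySwitcher returns a
	// proxy function that cycles through the given addresses.
	rp, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	// Fetch httpbin.org/ip five times, once through each proxy in turn.
	for i := 0; i < 5; i++ {
		c.Visit("https://httpbin.org/ip")
	}
}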
8 changes: 4 additions & 4 deletions _examples/random_delay/random_delay.go
@@ -12,10 +12,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
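
The hunk cuts off before the Limit call that its trailing comments describe. Assuming the rule looks the way the example's name and comments suggest (a LimitRule pairing Parallelism with a RandomDelay; both fields are assumptions here, not shown in this diff), the collector setup would be roughly:

package main

import (
	"time"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/debug"
)

func main() {
	c := colly.NewCollector(
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Assumed shape of the rule the comments above refer to: at most
	// two concurrent requests to httpbin.org hosts, each preceded by
	// a random delay.
	c.Limit(&colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})

	for i := 0; i < 4; i++ {
		c.Visit("https://httpbin.org/delay/2")
	}
}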
8 changes: 4 additions & 4 deletions _examples/rate_limit/rate_limit.go
@@ -11,10 +11,10 @@ func main() {
 	url := "https://httpbin.org/delay/2"
 
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Attach a debugger to the collector
-	c.SetDebugger(&debug.LogDebugger{})
+	c := colly.NewCollector(
+		// Attach a debugger to the collector
+		colly.Debugger(&debug.LogDebugger{}),
+	)
 
 	// Limit the number of threads started by colly to two
 	// when visiting links which domains' matches "*httpbin.*" glob
14 changes: 7 additions & 7 deletions _examples/url_filter/url_filter.go
@@ -9,13 +9,13 @@ import (
 
 func main() {
 	// Instantiate default collector
-	c := colly.NewCollector()
-
-	// Visit only root url and urls which start with "e" or "h" on httpbin.org
-	c.URLFilters = []*regexp.Regexp{
-		regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
-		regexp.MustCompile("http://httpbin\\.org/h.+"),
-	}
+	c := colly.NewCollector(
+		// Visit only root url and urls which start with "e" or "h" on httpbin.org
+		colly.URLFilters(
+			regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
+			regexp.MustCompile("http://httpbin\\.org/h.+"),
+		),
+	)
 
 	// On every a element which has href attribute call callback
 	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
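
An aside on the carried-over regexp literals: Go raw-string literals would drop the doubled backslashes. A drop-in equivalent for the two options above (a style note only, not part of this commit):

	colly.URLFilters(
		regexp.MustCompile(`http://httpbin\.org/(|e.+)$`),
		regexp.MustCompile(`http://httpbin\.org/h.+`),
	),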
9 changes: 5 additions & 4 deletions _examples/xkcd_store/xkcd_store.go
@@ -20,11 +20,12 @@ func main() {
 	defer writer.Flush()
 	// Write CSV header
 	writer.Write([]string{"Name", "Price", "URL", "Image URL"})
-	// Instantiate default collector
-	c := colly.NewCollector()
 
-	// Allow requests only to store.xkcd.com
-	c.AllowedDomains = []string{"store.xkcd.com"}
+	// Instantiate default collector
+	c := colly.NewCollector(
+		// Allow requests only to store.xkcd.com
+		colly.AllowedDomains("store.xkcd.com"),
+	)
 
 	// Extract product details
 	c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
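
For context, the writer in this hunk is an encoding/csv writer over a file created just above the hunk; a minimal sketch of that setup (the output file name here is an assumption for illustration):

package main

import (
	"encoding/csv"
	"log"
	"os"
)

func main() {
	// Assumed output file name, for illustration only.
	file, err := os.Create("xkcd_store_items.csv")
	if err != nil {
		log.Fatalf("Cannot create file: %v", err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Flush buffered rows to the file before the program exits.
	defer writer.Flush()

	// Write CSV header (same call as in the hunk above).
	writer.Write([]string{"Name", "Price", "URL", "Image URL"})
}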
