Skip to content

Commit

Permalink
Adds support for scraping multiple urls
Browse files Browse the repository at this point in the history
  • Loading branch information
hermansc committed May 1, 2014
1 parent 44846bb commit 383aa0a
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 51 deletions.
15 changes: 9 additions & 6 deletions scraper.conf.sample
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
# The URL we want to scrape. Should be a mobile link to finn.no
# url http://m.finn.no/bap/forsale/search.html
url http://m.finn.no/bap/forsale/search.html?price_to=6000&sub_category=1.93.3215

# The number of minutes between each request.
interval 1

# The email you want to receive the notifications on.
tomail [email protected]

# The mail to send from.
frommail [email protected]
frommail [email protected]

# The template we use for e-mail content
template default.tmpl
Expand All @@ -18,4 +14,11 @@ template default.tmpl
# useragent Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14

# Enable some (useful) debug messages.
debug false
debug true

#
# The URLs we want to scrape, one "url" line per search. Each should be a mobile link to finn.no
#
# url http://m.finn.no/bap/forsale/search.html?price_to=6000&sub_category=1.93.3215
url http://m.finn.no/bap/forsale/search.html

156 changes: 111 additions & 45 deletions scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"log"
"math/rand"
"net/http"
"net/smtp"
"os"
Expand All @@ -20,7 +21,7 @@ import (
)

type Config struct {
Url string
Urls []string
Interval int
ToEmail string
FromEmail string
Expand All @@ -32,13 +33,16 @@ type Config struct {

var config Config
var configLocation string
var seen map[string]bool
var seen map[string][]string

// printHelp writes a short usage message to standard error, showing how to
// invoke the program (os.Args[0]) with a config file.
func printHelp() {
	usage := fmt.Sprintf("Specify config-file:\n%s scraper.conf\n", os.Args[0])
	fmt.Fprint(os.Stderr, usage)
}

func loadConfig(filename string) error {
// Ensure the config is empty (and reset)
config = Config{}

f, err := ioutil.ReadFile(filename)
if err != nil {
return err
Expand All @@ -51,48 +55,55 @@ func loadConfig(filename string) error {
}

for _, line := range lines {
// Ignore empty lines
if len(line) == 0 {
continue
}

// Ignore comments
if strings.HasPrefix(line, "#") {
continue
}

params := strings.Split(line, " ")
key := strings.TrimSpace(params[0])
value := strings.TrimSpace(params[1])

// Read the URL
if params[0] == "url" {
config.Url = params[1]
if key == "url" {
config.Urls = append(config.Urls, value)
}

// Read the interval.
if params[0] == "interval" {
interval, err := strconv.Atoi(params[1])
if key == "interval" {
interval, err := strconv.Atoi(value)
if err != nil {
return err
}
config.Interval = interval
}

// Read the mails
if params[0] == "tomail" {
config.ToEmail = params[1]
if key == "tomail" {
config.ToEmail = value
}
if params[0] == "frommail" {
config.FromEmail = params[1]
if key == "frommail" {
config.FromEmail = value
}

// Check if useragent is defined.
if params[0] == "useragent" {
config.UserAgent = params[1]
if key == "useragent" {
config.UserAgent = value
}

// Check if we have defined our own template.
if params[0] == "template" {
config.Template = params[1]
if key == "template" {
config.Template = value
}

// Read the debug-param
if params[0] == "debug" {
debug, err := strconv.ParseBool(params[1])
if key == "debug" {
debug, err := strconv.ParseBool(value)
if err != nil {
return err
}
Expand All @@ -101,10 +112,13 @@ func loadConfig(filename string) error {
}

// We just loaded a presumably new config. So we start a new run.
config.FirstRun = true
config.FirstRun = false

// Create map of all seen ads, so it is not nil (and we reset in case HUP)
seen = make(map[string][]string)

// Check that the config is OK.
if config.ToEmail == "" || config.FromEmail == "" || config.Url == "" {
if config.ToEmail == "" || config.FromEmail == "" || config.Urls == nil {
return errors.New("Invalid configuration. You need to provide To/From-emails and Url")
}
if config.Interval < 1 {
Expand All @@ -125,8 +139,11 @@ func loadConfig(filename string) error {

// Ensure that the URL looks good.
re := regexp.MustCompile("^(http://)?(m.finn.no)(/.+/)+search.html(.*)$")
if !(re.Match([]byte(config.Url))) {
log.Fatal("Your URL is in a format invalid format. Are you using the mobile site? Check the documentation.")
for _, url := range config.Urls {
if !(re.Match([]byte(url))) {
log.Fatal("Your URL '" + url + `' is in a format invalid format.
Are you using the mobile site? Check the documentation.`)
}
}

// Everything is OK.
Expand All @@ -148,8 +165,9 @@ func handleSignals() {
log.Println(err.Error())
}
log.Println("Loaded new config after SIGHUP")

// Run a check, right away.
err = checkFinn()
err = checkAllUrls()
if err != nil {
log.Println(err.Error())
}
Expand All @@ -159,23 +177,56 @@ func handleSignals() {
}

// sendMail delivers content to a single recipient through the SMTP server
// listening on localhost:25.
//
// to and from are used as the SMTP envelope recipient and sender; content is
// written verbatim as the message data, so it must already contain whatever
// headers the message needs. The first error reported by the server (or by the
// connection) is returned; nil means the server accepted the message.
func sendMail(to, from, content string) error {
	// Check that the SMTP server is OK and connect.
	c, err := smtp.Dial("localhost:25")
	if err != nil {
		log.Println("Could not send e-mail, check your local SMTP-configuration. Got following error:")
		return err
	}
	// Release the connection on every return path. After a successful Quit
	// the connection is already closed and this Close only yields an error,
	// which is deliberately ignored.
	defer c.Close()

	// Set sender.
	if err := c.Mail(from); err != nil {
		return err
	}

	// Set recipient. The result must be captured here: a rejected recipient
	// is reported by Rcpt itself, not by any later call.
	if err := c.Rcpt(to); err != nil {
		return err
	}

	// Write the content to the mail buffer.
	wc, err := c.Data()
	if err != nil {
		return err
	}
	if _, err := wc.Write([]byte(content)); err != nil {
		// The write error is the one worth reporting; Close is best effort.
		wc.Close()
		return err
	}

	// Closing the data writer terminates the message and reads the server's
	// final reply, so its error decides whether the mail was accepted.
	if err := wc.Close(); err != nil {
		return err
	}

	// The message has been accepted at this point; a failing QUIT does not
	// change that, so its error is intentionally dropped.
	_ = c.Quit()

	// Mail successfully sent.
	return nil
}

func getMailContent(ads []string) (string, error) {
// stringInSlice reports whether in is equal to at least one element of list.
// A nil or empty list never contains anything.
func stringInSlice(in string, list []string) bool {
	found := false
	// Stop scanning as soon as a match has been recorded.
	for idx := 0; idx < len(list) && !found; idx++ {
		found = list[idx] == in
	}
	return found
}

func getMailContent(url string, ads []string) (string, error) {
// Add the strings of all ads on the dict sent to the template.
d := make(map[string]interface{})
d["Ads"] = strings.TrimRight(strings.Join(ads, ""), "\n\r")
d["NumResults"] = len(ads)
d["SearchURL"] = config.Url
d["SearchURL"] = url

content, err := parseTemplate(config.Template, d)
if err != nil {
Expand All @@ -198,17 +249,38 @@ func parseTemplate(filename string, data interface{}) (string, error) {
return buf.String(), nil
}

func checkFinn() error {
// checkAllUrls runs checkFinn once for every URL in config.Urls, pausing a
// random 1-6 seconds after each request.
//
// It returns the first error reported by checkFinn and stops there, leaving
// config.FirstRun untouched so the next round is handled the same way as this
// one. The callers log the returned error and keep running, so a failing
// check must be reported to them rather than terminating the process.
// When every URL has been checked, config.FirstRun is cleared and nil is
// returned.
func checkAllUrls() error {
	// Check all URLs found in the config file.
	for _, url := range config.Urls {
		if err := checkFinn(url); err != nil {
			return err
		}

		// Sleep between 1 and 6 seconds.
		mseconds := 1000 + rand.Intn(5000)
		time.Sleep(time.Duration(mseconds) * time.Millisecond)
	}

	// Now we indicate we are only looking for new ads.
	config.FirstRun = false
	return nil
}

func checkFinn(url string) error {
if config.Debug {
log.Println("Checking provided URL...")
log.Println("Checking " + url)
}

// For saving the non-seen new ads.
newAds := make([]string, 0)

// Open the provided URL.
client := &http.Client{}
req, err := http.NewRequest("GET", config.Url, nil)
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err
}
Expand All @@ -229,7 +301,7 @@ func checkFinn() error {
}

finncode, _ := s.Attr("id")
if _, ok := seen[finncode]; !ok {
if ok := stringInSlice(finncode, seen[url]); !ok {
// Construct a decent ad header.
title := strings.TrimSpace(s.Find("div[data-automation-id='titleRow']").Text())
price := strings.TrimSpace(s.Find("span[data-automation-id='bodyRow']").Text())
Expand All @@ -241,7 +313,9 @@ func checkFinn() error {

// Add the ad to our data structures, saving it.
newAds = append(newAds, adHeader)
seen[finncode] = true

// Add the finncode to seen ids.
seen[url] = append(seen[url], finncode)

// We assume there are no more than 5 new ads in one interval.
if i+1 == 5 && !(config.FirstRun) {
Expand All @@ -253,27 +327,22 @@ func checkFinn() error {

// We've found new ads, send them to the designated e-mail.
if len(newAds) > 0 && !(config.FirstRun) {
to := fmt.Sprintf("To: %s\r\n", config.ToEmail)
from := fmt.Sprintf("From: %v\r\n", config.FromEmail)
content, err := getMailContent(newAds)
content, err := getMailContent(url, newAds)
if err != nil {
log.Println(err.Error())
}

// Send the actual email.
err = sendMail(to, from, to+from+content)
err = sendMail(config.ToEmail, config.FromEmail, content)
if err != nil {
return err
}

log.Printf("Found %d new ads! Sent e-mail to %v!\n", len(newAds), config.ToEmail)
}

// Now we indicate we are only looking for new ads
if config.FirstRun {
fmt.Printf("Added %d ads to my memory. Looking for new ads every %v minutes and sending them to %v.\n",
len(seen), config.Interval, config.ToEmail)
config.FirstRun = false
len(seen[url]), config.Interval, config.ToEmail)
}

return nil
Expand All @@ -295,12 +364,9 @@ func main() {
// Listen for signals (SIGHUP)
handleSignals()

// Create map of all seen ads, so it is not nil.
seen = make(map[string]bool)

for {
// Check and report if any new ads are found.
err := checkFinn()
err := checkAllUrls()
if err != nil {
log.Println(err.Error())
}
Expand Down

0 comments on commit 383aa0a

Please sign in to comment.