diff --git a/scraper.conf.sample b/scraper.conf.sample index 3e7ff86..8e6b94b 100644 --- a/scraper.conf.sample +++ b/scraper.conf.sample @@ -1,7 +1,3 @@ -# The URL we want to scrape. Should be a mobile link to finn.no -# url http://m.finn.no/bap/forsale/search.html -url http://m.finn.no/bap/forsale/search.html?price_to=6000&sub_category=1.93.3215 - # The number of minutes between each request. interval 1 @@ -9,7 +5,7 @@ interval 1 tomail foo@mailinator.com # The mail to send from. -frommail foo@domain.com +frommail hermansc@samfundet.no # The template we use for e-mail content template default.tmpl @@ -18,4 +14,11 @@ template default.tmpl # useragent Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14 # Enable som (useful) debug messages. -debug false +debug true + +# +# The URLs we want to scrape. Should be a mobile link to finn.no +# +# url http://m.finn.no/bap/forsale/search.html?price_to=6000&sub_category=1.93.3215 +url http://m.finn.no/bap/forsale/search.html + diff --git a/scraper.go b/scraper.go index fe6c5f8..433c8b3 100644 --- a/scraper.go +++ b/scraper.go @@ -7,6 +7,7 @@ import ( "github.com/PuerkitoBio/goquery" "io/ioutil" "log" + "math/rand" "net/http" "net/smtp" "os" @@ -20,7 +21,7 @@ import ( ) type Config struct { - Url string + Urls []string Interval int ToEmail string FromEmail string @@ -32,13 +33,16 @@ type Config struct { var config Config var configLocation string -var seen map[string]bool +var seen map[string][]string func printHelp() { fmt.Fprintf(os.Stderr, "Specify config-file:\n%s scraper.conf\n", os.Args[0]) } func loadConfig(filename string) error { + // Ensure the config is empty (and reset) + config = Config{} + f, err := ioutil.ReadFile(filename) if err != nil { return err @@ -51,21 +55,28 @@ func loadConfig(filename string) error { } for _, line := range lines { + // Ignore empty lines + if len(line) == 0 { + continue + } + // Ignore comments if strings.HasPrefix(line, "#") { continue } params := strings.Split(line, " ") + key 
:= strings.TrimSpace(params[0]) + value := strings.TrimSpace(strings.Join(params[1:], " ")) // Read the URL - if params[0] == "url" { - config.Url = params[1] + if key == "url" { + config.Urls = append(config.Urls, value) } // Read the interval. - if params[0] == "interval" { - interval, err := strconv.Atoi(params[1]) + if key == "interval" { + interval, err := strconv.Atoi(value) if err != nil { return err } @@ -73,26 +84,26 @@ func loadConfig(filename string) error { } // Read the mails - if params[0] == "tomail" { - config.ToEmail = params[1] + if key == "tomail" { + config.ToEmail = value } - if params[0] == "frommail" { - config.FromEmail = params[1] + if key == "frommail" { + config.FromEmail = value } // Check if useragent is defined. - if params[0] == "useragent" { - config.UserAgent = params[1] + if key == "useragent" { + config.UserAgent = value } // Check if we have defined our own template. - if params[0] == "template" { - config.Template = params[1] + if key == "template" { + config.Template = value } // Read the debug-param - if params[0] == "debug" { - debug, err := strconv.ParseBool(params[1]) + if key == "debug" { + debug, err := strconv.ParseBool(value) if err != nil { return err } @@ -101,10 +112,13 @@ func loadConfig(filename string) error { } // We just loaded a presumably new config. So we start a new run. config.FirstRun = true + + // Create map of all seen ads, so it is not nil (and we reset in case HUP) + seen = make(map[string][]string) // Check that the config is OK. - if config.ToEmail == "" || config.FromEmail == "" || config.Url == "" { + if config.ToEmail == "" || config.FromEmail == "" || config.Urls == nil { return errors.New("Invalid configuration. You need to provide To/From-emails and Url") } if config.Interval < 1 { @@ -125,8 +139,11 @@ func loadConfig(filename string) error { // Ensure that the URL looks good. 
re := regexp.MustCompile("^(http://)?(m.finn.no)(/.+/)+search.html(.*)$") - if !(re.Match([]byte(config.Url))) { - log.Fatal("Your URL is in a format invalid format. Are you using the mobile site? Check the documentation.") + for _, url := range config.Urls { + if !(re.Match([]byte(url))) { + log.Fatal("Your URL '" + url + `' is in a format invalid format. + Are you using the mobile site? Check the documentation.`) + } } // Everything is OK. @@ -148,8 +165,9 @@ func handleSignals() { log.Println(err.Error()) } log.Println("Loaded new config after SIGHUP") + // Run a check, right away. - err = checkFinn() + err = checkAllUrls() if err != nil { log.Println(err.Error()) } @@ -159,23 +177,56 @@ func handleSignals() { } func sendMail(to, from, content string) error { - err := smtp.SendMail("localhost:25", - nil, - from, - []string{to}, - []byte(content)) + // Check that the SMTP server is OK and connect. + c, err := smtp.Dial("localhost:25") + if err != nil { + log.Println("Could not send e-mail, check your local SMTP-configuration. Got following error:") + return err + } + + // Set sender. + err = c.Mail(from) if err != nil { return err } + + // Set recipient. + err = c.Rcpt(to) + if err != nil { + return err + } + + // Write the content to the mail buffer. + wc, err := c.Data() + if err != nil { + return err + } + defer wc.Close() + buf := bytes.NewBufferString(content) + _, err = buf.WriteTo(wc) + if err != nil { + return err + } + + // Mail successfully sent. return nil } -func getMailContent(ads []string) (string, error) { +func stringInSlice(in string, list []string) bool { + for _, elem := range list { + if elem == in { + return true + } + } + return false +} + +func getMailContent(url string, ads []string) (string, error) { // Add the strings of all ads on the dict sent to the template. 
d := make(map[string]interface{}) d["Ads"] = strings.TrimRight(strings.Join(ads, ""), "\n\r") d["NumResults"] = len(ads) - d["SearchURL"] = config.Url + d["SearchURL"] = url content, err := parseTemplate(config.Template, d) if err != nil { @@ -198,9 +249,30 @@ func parseTemplate(filename string, data interface{}) (string, error) { return buf.String(), nil } -func checkFinn() error { +func checkAllUrls() error { + // Check all URLs found in the config file. + for _, url := range config.Urls { + err := checkFinn(url) + if err != nil { + log.Println(err.Error()) + } + + // Sleep between 1 and 6 seconds. + mseconds := 1000 + rand.Intn(5000) + time.Sleep(time.Duration(mseconds) * time.Millisecond) + + } + + // Now we indicate we are only looking for new ads + if config.FirstRun { + config.FirstRun = false + } + return nil +} + +func checkFinn(url string) error { if config.Debug { - log.Println("Checking provided URL...") + log.Println("Checking " + url) } // For saving the non-seen new ads. @@ -208,7 +280,7 @@ func checkFinn() error { // Open the provided URL. client := &http.Client{} - req, err := http.NewRequest("GET", config.Url, nil) + req, err := http.NewRequest("GET", url, nil) if err != nil { return err } @@ -229,7 +301,7 @@ func checkFinn() error { } finncode, _ := s.Attr("id") - if _, ok := seen[finncode]; !ok { + if ok := stringInSlice(finncode, seen[url]); !ok { // Construct a devent ad header. title := strings.TrimSpace(s.Find("div[data-automation-id='titleRow']").Text()) price := strings.TrimSpace(s.Find("span[data-automation-id='bodyRow']").Text()) @@ -241,7 +313,9 @@ func checkFinn() error { // Add the ad to our data structures, saving it. newAds = append(newAds, adHeader) - seen[finncode] = true + + // Add the finncode to seen ids. + seen[url] = append(seen[url], finncode) // We assume there are no more than 5 new ads in one interval. 
if i+1 == 5 && !(config.FirstRun) { @@ -253,27 +327,22 @@ func checkFinn() error { // We've found new ads, send them to the designated e-mail. if len(newAds) > 0 && !(config.FirstRun) { - to := fmt.Sprintf("To: %s\r\n", config.ToEmail) - from := fmt.Sprintf("From: %v\r\n", config.FromEmail) - content, err := getMailContent(newAds) + content, err := getMailContent(url, newAds) if err != nil { log.Println(err.Error()) } // Send the actual email. - err = sendMail(to, from, to+from+content) + err = sendMail(config.ToEmail, config.FromEmail, content) if err != nil { return err } log.Printf("Found %d new ads! Sent e-mail to %v!\n", len(newAds), config.ToEmail) } - - // Now we indicate we are only looking for new ads if config.FirstRun { fmt.Printf("Added %d ads to my memory. Looking for new ads every %v minutes and sending them to %v.\n", - len(seen), config.Interval, config.ToEmail) - config.FirstRun = false + len(seen[url]), config.Interval, config.ToEmail) } return nil @@ -295,12 +364,9 @@ func main() { // Listen for signals (SIGHUP) handleSignals() - // Create map of all seen ads, so it is not nil. - seen = make(map[string]bool) - for { // Check and report if any new ads are found. - err := checkFinn() + err := checkAllUrls() if err != nil { log.Println(err.Error()) }