Configurable Web Crawls
Adding the structure needed to begin configurable web
crawling.

Currently there are only two options: base, which configures
the base URL so the scanner ignores all other parts of a site
and focuses on a specific set of URLs (e.g. /forums), and
exclude, which tells the scanner to ignore URLs containing one
or more of the given strings. This allows explicitly skipping
uninteresting URLs (e.g. /profile or /settings) and also avoiding
URLs which might disrupt the scan (e.g. /logout).
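
For illustration, a single crawl configuration might look like the following (the onion address and paths are placeholders; the field names match the CrawlConfig struct in config/crawl_config.go below):

{
  "onion": "examplexxxxxxxxxxx.onion",
  "base": "/forums",
  "exclude": ["/profile", "/settings", "/logout"]
}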

This commit also fixes a bug in the web crawler where the depth
parameter was overridden by a constantly updating crawl map.
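
The fix can be seen in spider/onionspider.go below: the crawl loop now iterates over a snapshot of report.Crawls rather than the live map, so URLs discovered during one pass are only processed on the next pass. A minimal, self-contained sketch of the pattern (simplified names, not the actual OnionScan types):

package main

import "fmt"

func main() {
	crawls := map[string]int{"/": 0}
	processed := make(map[string]bool)
	for depth := 0; depth < 2; depth++ {
		// Snapshot the map so entries added while crawling this level
		// are deferred to the next level instead of extending this one.
		snapshot := make(map[string]int)
		for k, v := range crawls {
			snapshot[k] = v
		}
		for url, id := range snapshot {
			if processed[url] {
				continue
			}
			processed[url] = true
			fmt.Printf("depth %d: crawling %s (record %d)\n", depth, url, id)
			// Discovering a new page mutates the live map, not the snapshot.
			crawls[fmt.Sprintf("%ssub%d/", url, id)] = id + 1
		}
	}
}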
s-rah committed Oct 10, 2016
1 parent 3ed2d7d commit b7d242c
Showing 4 changed files with 58 additions and 60 deletions.
11 changes: 5 additions & 6 deletions config/crawl_config.go
@@ -1,14 +1,14 @@
 package config
 
 import (
-	"io/ioutil"
-	"encoding/json"
+	"encoding/json"
+	"io/ioutil"
 )
 
 type CrawlConfig struct {
-	Onion string `json:"onion"`
-	Base string `json:"base"`
-	Exclude []string `json:"exclude"`
+	Onion   string   `json:"onion"`
+	Base    string   `json:"base"`
+	Exclude []string `json:"exclude"`
 }
 
 func LoadCrawlConfig(filename string) (CrawlConfig, error) {
@@ -20,4 +20,3 @@ func LoadCrawlConfig(filename string) (CrawlConfig, error) {
 	err = json.Unmarshal(dat, &res)
 	return res, err
 }
-
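LoadCrawlConfig can also be called directly; a usage sketch (the import path is inferred from the crawldb import in onionscan_config.go below, and forums.json is a hypothetical file like the example in the commit message):

package main

import (
	"log"

	"github.com/s-rah/onionscan/config"
)

func main() {
	cc, err := config.LoadCrawlConfig("crawlconfigs/forums.json")
	if err != nil {
		log.Fatalf("could not load crawl config: %v", err)
	}
	log.Printf("onion=%s base=%s exclude=%v", cc.Onion, cc.Base, cc.Exclude)
}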
31 changes: 15 additions & 16 deletions config/onionscan_config.go
@@ -1,12 +1,12 @@
 package config
 
 import (
+	"fmt"
 	"github.com/s-rah/onionscan/crawldb"
 	"log"
-	"time"
-	"path/filepath"
 	"os"
-	"fmt"
+	"path/filepath"
+	"time"
 )
 
 type OnionScanConfig struct {
@@ -21,7 +21,6 @@ type OnionScanConfig struct {
 	CrawlConfigs map[string]CrawlConfig
 }
 
-
 func Configure(torProxyAddress string, directoryDepth int, fingerprint bool, timeout int, database string, scans []string, crawlconfigdir string, verbose bool) *OnionScanConfig {
 	osc := new(OnionScanConfig)
 	osc.TorProxyAddress = torProxyAddress
@@ -34,20 +33,20 @@ func Configure(torProxyAddress string, directoryDepth int, fingerprint bool, tim
 	osc.RescanDuration = time.Hour * -100
 	osc.Scans = scans
 	osc.CrawlConfigs = make(map[string]CrawlConfig)
-	visit := func (path string, f os.FileInfo, err error) error {
-		if !f.IsDir() {
-			cc,err := LoadCrawlConfig(path)
-			if err == nil {
-				osc.LogInfo(fmt.Sprintf("Loading Crawl Config for %s %v", cc.Onion, cc))
-				osc.CrawlConfigs[cc.Onion] = cc
-			}
-		}
-		return nil
-
+	visit := func(path string, f os.FileInfo, err error) error {
+		if !f.IsDir() {
+			cc, err := LoadCrawlConfig(path)
+			if err == nil {
+				osc.LogInfo(fmt.Sprintf("Loading Crawl Config for %s %v", cc.Onion, cc))
+				osc.CrawlConfigs[cc.Onion] = cc
+			}
+		}
+		return nil
 	}
 
 	filepath.Walk(crawlconfigdir, visit)
 
 	return osc
 }
 
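The visit callback above means every regular file under crawlconfigdir is treated as a potential crawl config: files that parse are registered in osc.CrawlConfigs keyed by their onion field, and files that fail to parse are silently skipped (the err == nil check). A hypothetical layout (file names are arbitrary; only the JSON contents matter):

crawlconfigs/
    forum-site.json    {"onion": "example1xxxxxxxxxx.onion", "base": "/forums"}
    shop-site.json     {"onion": "example2xxxxxxxxxx.onion", "exclude": ["/logout"]}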
4 changes: 2 additions & 2 deletions deanonymization/check_exif_test.go
@@ -39,10 +39,10 @@ func TagsToMap(exiftags []report.ExifTag) map[string]string {
 
 func CompareMaps(t *testing.T, foundTags map[string]string, expectedTags map[string]string) {
 	var allTags []string
-	for k, _ := range expectedTags {
+	for k := range expectedTags {
 		allTags = append(allTags, k)
 	}
-	for k, _ := range foundTags {
+	for k := range foundTags {
 		allTags = append(allTags, k)
 	}
 	utils.RemoveDuplicates(&allTags)
72 changes: 36 additions & 36 deletions spider/onionspider.go
@@ -37,10 +37,10 @@ func (os *OnionSpider) Crawl(hiddenservice string, osc *config.OnionScanConfig,
 		Jar: cookieJar,
 	}
 
-	basepath := osc.CrawlConfigs[hiddenservice].Base
-	if basepath == "" {
-		basepath = "/"
-	}
+	basepath := osc.CrawlConfigs[hiddenservice].Base
+	if basepath == "" {
+		basepath = "/"
+	}
 
 	base, err := url.Parse("http://" + hiddenservice + basepath)
 
@@ -80,18 +80,18 @@ func (os *OnionSpider) Crawl(hiddenservice string, osc *config.OnionScanConfig,
 			}
 
 			potentialDirectory := NormalizeURI(resourceURI.Path[:term], base)
-			_,exists := report.Crawls[potentialDirectory]
+			_, exists := report.Crawls[potentialDirectory]
 			if !exists {
-				result,cid := osc.Database.HasCrawlRecord(potentialDirectory, osc.RescanDuration)
-				if !result {
-					osc.LogInfo(fmt.Sprintf("Scanning Directory: %s", potentialDirectory))
-					id, err := os.GetPage(potentialDirectory, base, osc, false)
-					addCrawl(potentialDirectory, id, err)
-					scanDir(potentialDirectory)
-				} else {
-					osc.LogInfo(fmt.Sprintf("Already crawled %s (%s) recently - reusing existing crawl", resourceURI.Path[:term], potentialDirectory))
-					addCrawl(potentialDirectory, cid, nil)
-				}
+				result, cid := osc.Database.HasCrawlRecord(potentialDirectory, osc.RescanDuration)
+				if !result {
+					osc.LogInfo(fmt.Sprintf("Scanning Directory: %s", potentialDirectory))
+					id, err := os.GetPage(potentialDirectory, base, osc, false)
+					addCrawl(potentialDirectory, id, err)
+					scanDir(potentialDirectory)
+				} else {
+					osc.LogInfo(fmt.Sprintf("Already crawled %s (%s) recently - reusing existing crawl", resourceURI.Path[:term], potentialDirectory))
+					addCrawl(potentialDirectory, cid, nil)
+				}
 			}
 		}
 	}
@@ -101,8 +101,8 @@ func (os *OnionSpider) Crawl(hiddenservice string, osc *config.OnionScanConfig,
 		target, err := url.Parse(uri)
 		if err == nil && base.Host == target.Host {
 			normalizeTarget := NormalizeURI(target.String(), base)
-			_,exists := report.Crawls[normalizeTarget]
-			if strings.HasPrefix(target.Path, base.Path) && !exists {
+			_, exists := report.Crawls[normalizeTarget]
+			if strings.HasPrefix(target.Path, base.Path) && !exists {
 				result, cid := osc.Database.HasCrawlRecord(normalizeTarget, osc.RescanDuration)
 				if !result {
 					osc.LogInfo(fmt.Sprintf("Scanning URI: %s", target.String()))
@@ -117,14 +117,14 @@ func (os *OnionSpider) Crawl(hiddenservice string, osc *config.OnionScanConfig,
 			}
 		}
 
-	exclude := func(uri string) bool {
-		for _,rule := range osc.CrawlConfigs[hiddenservice].Exclude {
-			if strings.Contains(uri, rule) {
-				return true
-			}
-		}
-		return false
-	}
+	exclude := func(uri string) bool {
+		for _, rule := range osc.CrawlConfigs[hiddenservice].Exclude {
+			if strings.Contains(uri, rule) {
+				return true
+			}
+		}
+		return false
+	}
 
 	// Grab Server Status if it Exists
 	// We add it as a resource so we can pull any information out of it later.
@@ -146,32 +146,32 @@ func (os *OnionSpider) Crawl(hiddenservice string, osc *config.OnionScanConfig,
 	for i := 0; i < osc.Depth; i++ {
 		// Process all the images we can find
 		osc.LogInfo(fmt.Sprintf("Scanning Depth: %d", i))
 
 		// Copy to Prevent Map Updating from Influencing Depth
 		crawlMap := make(map[string]int)
-		for k,v := range report.Crawls {
-			crawlMap[k] = v
-		}
+		for k, v := range report.Crawls {
+			crawlMap[k] = v
+		}
 
 		for url, id := range crawlMap {
 			_, exists := processed[url]
 			if !exists {
 				crawlRecord, _ := osc.Database.GetCrawlRecord(id)
 				for _, image := range crawlRecord.Page.Images {
-					if !exclude(image.Target) {
-						processURI(image.Target, base)
+					if !exclude(image.Target) {
+						processURI(image.Target, base)
 					}
 				}
 
 				for _, anchor := range crawlRecord.Page.Anchors {
-					if !exclude(anchor.Target) {
-						processURI(anchor.Target, base)
+					if !exclude(anchor.Target) {
+						processURI(anchor.Target, base)
 					}
 				}
 
 				for _, link := range crawlRecord.Page.Links {
-					if !exclude(link.Target) {
-						processURI(link.Target, base)
+					if !exclude(link.Target) {
+						processURI(link.Target, base)
 					}
 				}
 
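Note that exclude above is a plain substring test, so a rule like "/logout" also skips "/user/logout" or "/logout.php". A standalone sketch of the same logic, with hypothetical rules and URIs:

package main

import (
	"fmt"
	"strings"
)

func main() {
	rules := []string{"/logout", "/settings"}
	// Same logic as the exclude closure in onionspider.go:
	// a URI is skipped if it contains any configured substring.
	exclude := func(uri string) bool {
		for _, rule := range rules {
			if strings.Contains(uri, rule) {
				return true
			}
		}
		return false
	}
	fmt.Println(exclude("/forums/thread/1")) // false: will be crawled
	fmt.Println(exclude("/user/logout"))     // true: skipped
}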
