Skip to content

Commit

Permalink
Merge pull request s-rah#41 from mapmeld/master
Browse files Browse the repository at this point in the history
Scan CSS style tags and stylesheet links
  • Loading branch information
s-rah committed May 27, 2016
2 parents 4874bee + cd9bfa2 commit 49095c5
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 10 deletions.
12 changes: 10 additions & 2 deletions protocol/http_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,23 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf
}

// ScanPage scrapes page from hiddenService via ScrapePage and passes the
// result to the callback f.
//
// f receives this scanner, the page exactly as given by the caller, the HTTP
// status code, the response body converted to a string, and report.
func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
	// The scrape error is discarded here: f is always invoked and receives
	// whatever status and body ScrapePage returned.
	_, body, status := hps.ScrapePage(hiddenService, page)
	text := string(body)
	f(hps, page, status, text, report)
}

// ScrapePage fetches a page over HTTP through hps.Client and returns its body.
//
// If page does not already contain the value utils.WithoutSubdomains returns
// for hiddenService, it is treated as a path on the hidden service: a leading
// "/" is added when missing and hiddenService is prepended. The request is
// always issued with the "http://" scheme.
//
// The results are, in order: an error (nil on success), the response body,
// and the HTTP status code. When the connection fails the body is nil and the
// status is -1. When reading the body fails, the error is returned together
// with whatever bytes were read and the real status code, so callers that
// ignore the error still see the same contents as before.
func (hps *HTTPProtocolScanner) ScrapePage(hiddenService string, page string) (error, []byte, int) {
	if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
		if !strings.HasPrefix(page, "/") {
			page = "/" + page
		}
		page = hiddenService + page
	}

	response, err := hps.Client.Get("http://" + page)
	if err != nil {
		log.Printf("Error connecting to http://%s %s\n", page, err)
		return err, nil, -1
	}
	defer response.Body.Close()

	contents, err := ioutil.ReadAll(response.Body)
	if err != nil {
		// Callers in this code base discard the returned error, so log the
		// failure here as well, mirroring the connection-error path above.
		log.Printf("Error reading response from http://%s %s\n", page, err)
	}
	return err, contents, response.StatusCode
}
1 change: 1 addition & 0 deletions scans/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ import (

// Scanner is the behaviour the page scans require from a protocol scanner:
// fetching a page and handing it to a scan callback, or fetching it raw.
type Scanner interface {
	// ScanPage fetches page from hiddenService and invokes f with the
	// scanner, the page, the status code, the page contents and r.
	ScanPage(hiddenService string, page string, r *report.OnionScanReport, f func(Scanner, string, int, string, *report.OnionScanReport))

	// ScrapePage fetches page from hiddenService and returns, in order, an
	// error, the raw response body and the status code.
	ScrapePage(hiddenService string, page string) (error, []byte, int)
}
27 changes: 20 additions & 7 deletions scans/standard-page-scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
new(PGPContentScan).ScanContent(contents, report)

log.Printf("\tScanning for Images\n")
domains := utils.ExtractDomains(contents)
var domains []string
var cssLinks []string

// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
z := html.NewTokenizer(strings.NewReader(contents))
Expand All @@ -49,17 +50,22 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
// TODO: don't crawl links with nofollow

if tt == html.StartTagToken {
isLink := t.Data == "a"
if isLink {
// links
if t.Data == "a" {
linkUrl := utils.GetAttribute(t, "href")
if len(linkUrl) > 1 {
domains = append(domains, linkUrl)
}
}
}

isImage := t.Data == "img"
if isImage {
// css <link>
if t.Data == "link" && utils.GetAttribute(t, "rel") == "stylesheet" {
cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))
}

// images
if t.Data == "img" {
imageUrl := utils.GetAttribute(t, "src")

baseUrl, _ := url.Parse(imageUrl)
Expand All @@ -72,8 +78,15 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
}
}

log.Printf("\tScanning for Links\n")
log.Printf("\tScanning for CSS Fonts and Background Images\n")
for _, cssUrl := range cssLinks {
log.Printf("\tScanning CSS file: %s\n", cssUrl)
_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssUrl))
domains = append(domains, utils.ExtractDomains(string(cssContents))[0:]...)
}

log.Printf("\tScanning for Links\n")
domains = append(domains, utils.ExtractDomains(contents)...)
for _, domain := range domains {
baseUrl, _ := url.Parse(domain)
if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
Expand All @@ -95,7 +108,7 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
foundPaths := r.FindAllStringSubmatch(string(contents), -1)
for _, regexpResults := range foundPaths {
path := regexpResults[2]
if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
if (strings.HasPrefix(path, "http") || strings.HasPrefix(path, "//")) && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
continue
}

Expand Down
17 changes: 16 additions & 1 deletion utils/url_parsing.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,23 @@ package utils

import (
"github.com/mvdan/xurls"
"regexp"
"strings"
)

// cssURLPattern matches CSS url(...) references case-insensitively and
// captures the text between the parentheses. It is compiled once at package
// scope instead of on every ExtractDomains call.
var cssURLPattern = regexp.MustCompile(`(?i)url\((.*?)\)`)

// ExtractDomains returns every URL found in content.
//
// The result starts with all matches of xurls.Strict, followed by the target
// of each CSS url(...) reference. Surrounding whitespace and a pair of single
// or double quotes are stripped from CSS targets, so url("http://x/y.png")
// yields http://x/y.png rather than a quoted string that url.Parse rejects.
// Empty targets are skipped.
func ExtractDomains(content string) []string {
	domains := xurls.Strict.FindAllString(content, -1)

	for _, match := range cssURLPattern.FindAllStringSubmatch(content, -1) {
		// match[1] is the capture group: the raw text inside url( ... ).
		target := strings.Trim(strings.TrimSpace(match[1]), `"'`)
		if target == "" {
			continue
		}
		// NOTE(review): the :before/:after filter is kept from the original;
		// presumably it drops selector text captured by mistake — confirm.
		if strings.HasSuffix(target, ":before") || strings.HasSuffix(target, ":after") {
			continue
		}
		domains = append(domains, target)
	}
	return domains
}

func WithoutSubdomains(urlhost string) string {
Expand All @@ -25,5 +37,8 @@ func WithoutProtocol(url string) string {
if strings.HasPrefix(url, "https://") {
return url[8:]
}
if strings.HasPrefix(url, "//") {
return url[2:]
}
return url
}

0 comments on commit 49095c5

Please sign in to comment.