forked from DanMcInerney/xsscrapy
Commit 99736db
DanMcInerney committed Jun 23, 2014
1 parent: 9dbf1d6
Showing 16 changed files with 211 additions and 0 deletions.
scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = xsscrapy.settings

[deploy]
#url = http://localhost:6800/
project = xss_spider
xsscrapy/__init__.py (empty file)
xsscrapy/items.py
@@ -0,0 +1,10 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Link(Item):
    url = Field()
    body = Field()
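Link is a standard Scrapy Item, so it behaves like a dict restricted to its declared fields; a quick illustration with made-up values:

from xsscrapy.items import Link

item = Link()
item['url'] = 'http://example.com/'
print item['url']
# Assigning to an undeclared field, e.g. item['title'] = '...', raises KeyError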
xsscrapy/middlewares.py
@@ -0,0 +1,10 @@
from xsscrapy.settings import USER_AGENT_LIST
import random
from scrapy import log

class RandomUserAgentMiddleware(object):
    ''' Use a random user-agent for each request '''
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
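RandomUserAgentMiddleware picks one User-Agent per request from the USER_AGENT_LIST defined in settings.py. A minimal sketch of exercising it outside a running crawl (the spider argument is passed as None only because this middleware never touches it):

from scrapy.http import Request
from xsscrapy.middlewares import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware()
req = Request('http://example.com')
mw.process_request(req, None)    # sets the header in place, returns None
print req.headers['User-Agent']  # one of the six strings from USER_AGENT_LIST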
xsscrapy/pipelines.py
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class XSS_pipeline(object):
    def process_item(self, item, spider):
        return item
xsscrapy/settings.py
@@ -0,0 +1,28 @@
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'xsscrapy'

SPIDER_MODULES = ['xsscrapy.spiders']
NEWSPIDER_MODULE = 'xsscrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
# Get a random user agent for each crawled page
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
DOWNLOADER_MIDDLEWARES = {'xsscrapy.middlewares.RandomUserAgentMiddleware': 400,
                          'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,}

# Prevent duplicate link crawling
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

ITEM_PIPELINES = {'xsscrapy.pipelines.XSS_pipeline': 100} # Order value in the 0-1000 range; lower values run earlier
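The integers in DOWNLOADER_MIDDLEWARES and ITEM_PIPELINES are order values in the 0-1000 range: components run from lowest to highest, and mapping a component to None disables it, which is how the stock UserAgentMiddleware is turned off here in favor of the random one at 400. A hypothetical illustration of pipeline chaining, with a LoggingPipeline that is not part of this commit:

# XSS_pipeline (100) sees each item first, then the hypothetical LoggingPipeline (200)
ITEM_PIPELINES = {'xsscrapy.pipelines.XSS_pipeline': 100,
                  'xsscrapy.pipelines.LoggingPipeline': 200}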
xsscrapy/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
xsscrapy/spiders/xss_spider.py
@@ -0,0 +1,140 @@
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import Request

from xsscrapy.items import Link

from urlparse import urlparse, parse_qsl
import urllib
import re


class XSSspider(CrawlSpider):
    name = 'xss_spider'
    #allowed_domains = ['coin.co']
    #start_urls = ['http://coin.co']

    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=True), )

    def __init__(self, *args, **kwargs):
        # Run with: scrapy crawl xss_spider -a url='http://kadira.com'
        super(XSSspider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        # Wrapping the root domain in a list seems to let the spider crawl its subdomains as well
        self.allowed_domains = ['.'.join(hostname.split('.')[-2:])]
        self.payloader = xss_payloader()

    def parse_url(self, response):
        item = Link()
        item['url'] = response.url
        payloaded_urls = self.payloader.run(item['url'])
        if payloaded_urls:
            return [Request(url, callback=self.find_xss_in_body) for url in payloaded_urls]

        #item['body'] = response.body
        return item

    def find_xss_in_body(self, response):
        delim = '9zqjx'
        body = response.body
        url = response.url
        tester = '"\'><()=;/:'
        if tester in body:
            print '------------------------- 100% vulnerable:', url

        allBetweenDelims = '%s(.*?)%s' % (delim, delim)
        matches = re.findall(allBetweenDelims, body)
        if len(matches) > 0:
            # Stub in this commit: reflections found between the delimiters
            # are collected but not yet analyzed
            pass
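        # Worked example with a hypothetical response body: if the page echoes
        # the payload back unescaped, e.g. body = '<b>9zqjx"\'><()=;/:9zqjx</b>',
        # then tester is found in body and the re.findall() above returns
        # ['"\'><()=;/:']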

class xss_payloader:
    ''' Find urls with parameters then return a list of urls with 1 xss payload per param '''

    def __init__(self):
        self.xssDelim = '9zqjx' # zqjx has the least amount of google search results I can find for a 4 letter combo (47.2K)
        self.payloadTests = [self.xssDelim+'"\'><()=;/:'+self.xssDelim, # Normal check
                             self.xssDelim+'%22%27%3E%3C%28%29%3D%3B%2F%3A'+self.xssDelim, # Hex encoded
                             self.xssDelim+'&quot&#39&gt&lt&#40&#41&#61&#59&#47&#58'+self.xssDelim] # HTML encoded without semicolons

    def run(self, url):
        if '=' in url:
            payloaded_urls = self.checkForURLparams(url)
            return payloaded_urls

    def checkForURLparams(self, url):
        ''' Add links with variables in them to the queue again but with XSS testing payloads '''
        payloaded_urls = []
        params = self.getURLparams(url)
        moddedParams = self.change_params(params)
        hostname, protocol, root_domain, path = self.url_processor(url)
        if hostname and protocol and path:
            for payload in moddedParams:
                for params in moddedParams[payload]:
                    joinedParams = urllib.urlencode(params, doseq=1) # doseq maps the params back together
                    newURL = urllib.unquote(protocol+hostname+path+'?'+joinedParams)
                    payloaded_urls.append(newURL)
        return payloaded_urls

    def getURLparams(self, url):
        ''' Parse out the URL parameters '''
        parsedUrl = urlparse(url)
        fullParams = parsedUrl.query
        params = parse_qsl(fullParams) # parse_qsl rather than parse_qs in order to preserve order
        return params

    def change_params(self, params):
        ''' Returns a list of complete parameters, each with 1 parameter changed to an XSS vector '''
        changedParams = []
        changedParam = False
        moddedParams = []
        allModdedParams = {}

        # Create a list of lists; each inner list is the full param set for one
        # URL we will test. This preserves the order of the URL parameters and
        # also tests each parameter individually instead of all at once
        for payload in self.payloadTests:
            allModdedParams[payload] = []
            for x in xrange(0, len(params)):
                for p in params:
                    param = p[0]
                    value = p[1]
                    # If a parameter has not been modified yet
                    if param not in changedParams and changedParam == False:
                        newValue = payload
                        changedParams.append(param)
                        p = (param, newValue)
                        moddedParams.append(p)
                        changedParam = True
                    else:
                        moddedParams.append(p)

                # Reset so we can step through again and change a diff param
                allModdedParams[payload].append(moddedParams)
                changedParam = False
                moddedParams = []

            # Reset the list of changed params each time a new payload is attempted
            changedParams = []

        return allModdedParams
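    # Worked example (hypothetical values): for params = [('a', '1'), ('b', '2')]
    # and a payload P, allModdedParams[P] comes out as
    #     [[('a', P), ('b', '2')],
    #      [('a', '1'), ('b', P)]]
    # i.e. one complete param list per candidate URL, with exactly one
    # parameter swapped for P in each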

    def url_processor(self, url):
        ''' Get the url domain, protocol, and hostname using urlparse '''
        try:
            parsed_url = urlparse(url)
            # Get the path
            path = parsed_url.path
            # Get the protocol
            protocol = parsed_url.scheme+'://'
            # Get the hostname (includes subdomains)
            hostname = parsed_url.hostname
            # Get the root domain
            root_domain = '.'.join(hostname.split('.')[-2:])
        except:
            print '[-] Could not parse url:', url
            # Return a 4-tuple of Nones so the caller's unpack in
            # checkForURLparams does not raise a TypeError
            return (None, None, None, None)

        return (hostname, protocol, root_domain, path)
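For a sense of what the payloader emits end to end, a short sketch (the URL is made up, and the import path assumes this module lives at xsscrapy/spiders/xss_spider.py):

from xsscrapy.spiders.xss_spider import xss_payloader

payloader = xss_payloader()
for u in payloader.run('http://example.com/search?q=shoes&page=2'):
    print u

# Expect 6 URLs (3 payloads x 2 parameters), each with exactly one parameter
# replaced, e.g.
# http://example.com/search?q=9zqjx"'><()=;/:9zqjx&page=2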