forked from DanMcInerney/xsscrapy
Commit 99736db
DanMcInerney committed Jun 23, 2014
1 parent: 9dbf1d6
Showing 16 changed files with 211 additions and 0 deletions.
scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = xsscrapy.settings

[deploy]
#url = http://localhost:6800/
project = xss_spider
xsscrapy/__init__.py (empty file)
xsscrapy/items.py
@@ -0,0 +1,10 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class Link(Item):
    url = Field()
    body = Field()
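Link is a standard Scrapy Item, so it behaves like a dict restricted to its declared fields; a quick illustration with made-up values:

from xsscrapy.items import Link

item = Link()
item['url'] = 'http://example.com/'
print item['url']
# Assigning to an undeclared field, e.g. item['title'] = '...', raises KeyError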
xsscrapy/middlewares.py
@@ -0,0 +1,10 @@
from xsscrapy.settings import USER_AGENT_LIST
import random
from scrapy import log

class RandomUserAgentMiddleware(object):
    ''' Use a random user-agent for each request '''
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
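RandomUserAgentMiddleware picks one User-Agent per request from the USER_AGENT_LIST defined in settings.py. A minimal sketch of exercising it outside a running crawl (the spider argument is passed as None only because this middleware never touches it):

from scrapy.http import Request
from xsscrapy.middlewares import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware()
req = Request('http://example.com')
mw.process_request(req, None)    # sets the header in place, returns None
print req.headers['User-Agent']  # one of the six strings from USER_AGENT_LIST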
xsscrapy/pipelines.py
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class XSS_pipeline(object):
    def process_item(self, item, spider):
        return item
xsscrapy/settings.py
@@ -0,0 +1,28 @@
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'xsscrapy'

SPIDER_MODULES = ['xsscrapy.spiders']
NEWSPIDER_MODULE = 'xsscrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
# Get a random user agent for each crawled page
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
DOWNLOADER_MIDDLEWARES = {'xsscrapy.middlewares.RandomUserAgentMiddleware': 400,
                          'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,}

# Prevent duplicate link crawling
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

ITEM_PIPELINES = {'xsscrapy.pipelines.XSS_pipeline': 100} # Order value in the 0-1000 range; lower values run earlier
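The integers in DOWNLOADER_MIDDLEWARES and ITEM_PIPELINES are order values in the 0-1000 range: components run from lowest to highest, and mapping a component to None disables it, which is how the stock UserAgentMiddleware is turned off here in favor of the random one at 400. A hypothetical illustration of pipeline chaining, with a LoggingPipeline that is not part of this commit:

# XSS_pipeline (100) sees each item first, then the hypothetical LoggingPipeline (200)
ITEM_PIPELINES = {'xsscrapy.pipelines.XSS_pipeline': 100,
                  'xsscrapy.pipelines.LoggingPipeline': 200}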
xsscrapy/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
xsscrapy/spiders/xss_spider.py
@@ -0,0 +1,140 @@
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import Request

from xsscrapy.items import Link

from urlparse import urlparse, parse_qsl
import urllib
import re


class XSSspider(CrawlSpider):
    name = 'xss_spider'
    #allowed_domains = ['coin.co']
    #start_urls = ['http://coin.co']

    rules = (Rule(SgmlLinkExtractor(), callback='parse_url', follow=True), )

    def __init__(self, *args, **kwargs):
        # Run with: scrapy crawl xss_spider -a url='http://kadira.com'
        super(XSSspider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('url')]
        hostname = urlparse(self.start_urls[0]).hostname
        # Wrapping the root domain in a list seems to let the spider crawl its subdomains as well
        self.allowed_domains = ['.'.join(hostname.split('.')[-2:])]
        self.payloader = xss_payloader()

    def parse_url(self, response):
        item = Link()
        item['url'] = response.url
        payloaded_urls = self.payloader.run(item['url'])
        if payloaded_urls:
            return [Request(url, callback=self.find_xss_in_body) for url in payloaded_urls]

        #item['body'] = response.body
        return item

    def find_xss_in_body(self, response):
        delim = '9zqjx'
        body = response.body
        url = response.url
        tester = '"\'><()=;/:'
        if tester in body:
            print '------------------------- 100% vulnerable:', url

        allBetweenDelims = '%s(.*?)%s' % (delim, delim)
        matches = re.findall(allBetweenDelims, body)
        if len(matches) > 0:
            # Stub in this commit: reflections found between the delimiters
            # are collected but not yet analyzed
            pass
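        # Worked example with a hypothetical response body: if the page echoes
        # the payload back unescaped, e.g. body = '<b>9zqjx"\'><()=;/:9zqjx</b>',
        # then tester is found in body and the re.findall() above returns
        # ['"\'><()=;/:']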

class xss_payloader:
    ''' Find urls with parameters then return a list of urls with 1 xss payload per param '''

    def __init__(self):
        self.xssDelim = '9zqjx' # zqjx has the least amount of google search results I can find for a 4 letter combo (47.2K)
        self.payloadTests = [self.xssDelim+'"\'><()=;/:'+self.xssDelim, # Normal check
                             self.xssDelim+'%22%27%3E%3C%28%29%3D%3B%2F%3A'+self.xssDelim, # Hex encoded
                             self.xssDelim+'&quot&#39&gt&lt&#40&#41&#61&#59&#47&#58'+self.xssDelim] # HTML encoded without semicolons

    def run(self, url):
        if '=' in url:
            payloaded_urls = self.checkForURLparams(url)
            return payloaded_urls

    def checkForURLparams(self, url):
        ''' Add links with variables in them to the queue again but with XSS testing payloads '''
        payloaded_urls = []
        params = self.getURLparams(url)
        moddedParams = self.change_params(params)
        hostname, protocol, root_domain, path = self.url_processor(url)
        if hostname and protocol and path:
            for payload in moddedParams:
                for params in moddedParams[payload]:
                    joinedParams = urllib.urlencode(params, doseq=1) # doseq maps the params back together
                    newURL = urllib.unquote(protocol+hostname+path+'?'+joinedParams)
                    payloaded_urls.append(newURL)
        return payloaded_urls

    def getURLparams(self, url):
        ''' Parse out the URL parameters '''
        parsedUrl = urlparse(url)
        fullParams = parsedUrl.query
        params = parse_qsl(fullParams) # parse_qsl rather than parse_qs in order to preserve order
        return params

    def change_params(self, params):
        ''' Returns a list of complete parameters, each with 1 parameter changed to an XSS vector '''
        changedParams = []
        changedParam = False
        moddedParams = []
        allModdedParams = {}

        # Create a list of lists; each inner list is the full param set for one
        # URL we will test. This preserves the order of the URL parameters and
        # also tests each parameter individually instead of all at once
        for payload in self.payloadTests:
            allModdedParams[payload] = []
            for x in xrange(0, len(params)):
                for p in params:
                    param = p[0]
                    value = p[1]
                    # If a parameter has not been modified yet
                    if param not in changedParams and changedParam == False:
                        newValue = payload
                        changedParams.append(param)
                        p = (param, newValue)
                        moddedParams.append(p)
                        changedParam = True
                    else:
                        moddedParams.append(p)

                # Reset so we can step through again and change a diff param
                allModdedParams[payload].append(moddedParams)
                changedParam = False
                moddedParams = []

            # Reset the list of changed params each time a new payload is attempted
            changedParams = []

        return allModdedParams
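    # Worked example (hypothetical values): for params = [('a', '1'), ('b', '2')]
    # and a payload P, allModdedParams[P] comes out as
    #     [[('a', P), ('b', '2')],
    #      [('a', '1'), ('b', P)]]
    # i.e. one complete param list per candidate URL, with exactly one
    # parameter swapped for P in each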

    def url_processor(self, url):
        ''' Get the url domain, protocol, and hostname using urlparse '''
        try:
            parsed_url = urlparse(url)
            # Get the path
            path = parsed_url.path
            # Get the protocol
            protocol = parsed_url.scheme+'://'
            # Get the hostname (includes subdomains)
            hostname = parsed_url.hostname
            # Get the root domain
            root_domain = '.'.join(hostname.split('.')[-2:])
        except:
            print '[-] Could not parse url:', url
            # Return a 4-tuple of Nones so the caller's unpack in
            # checkForURLparams does not raise a TypeError
            return (None, None, None, None)

        return (hostname, protocol, root_domain, path)
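For a sense of what the payloader emits end to end, a short sketch (the URL is made up, and the import path assumes this module lives at xsscrapy/spiders/xss_spider.py):

from xsscrapy.spiders.xss_spider import xss_payloader

payloader = xss_payloader()
for u in payloader.run('http://example.com/search?q=shoes&page=2'):
    print u

# Expect 6 URLs (3 payloads x 2 parameters), each with exactly one parameter
# replaced, e.g.
# http://example.com/search?q=9zqjx"'><()=;/:9zqjx&page=2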