Commit 07c4b51

fixed unfiltered characters

Dan McInerney committed Dec 15, 2014
1 parent 80ee5c6 commit 07c4b51
Showing 2 changed files with 69 additions and 66 deletions.
133 changes: 68 additions & 65 deletions xsscrapy/pipelines.py
@@ -10,7 +10,7 @@
 import lxml.html
 from lxml.html import soupparser, fromstring
 import itertools
-#from IPython import embed
+from IPython import embed

 class XSSCharFinder(object):
     def __init__(self):
@@ -27,7 +27,7 @@ def process_item(self, item, spider):
         body = response.body
         mismatch = False
         error = None
-        orig_payload = payload.replace(delim, '').replace(';9', '') # xss char payload
+        fuzz_payload = payload.replace(delim, '').replace(';9', '') # xss char payload
         # Regex: ( ) means group 1 is within the parens, . means any char,
         # {1,80} means match any char 1 to 80 times; 80 chosen because double URL encoding
         # ? makes the search nongreedy so it stops after hitting its limits
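Note: a minimal sketch of how the delimited payload and this regex interact. The delimiter and fuzz characters below are hypothetical placeholders; xsscrapy generates its own at runtime.

import re

delim = '9zqjx1'                               # placeholder 6-char delimiter (the real one is random)
payload = delim + '\'"(){}<>' + delim + ';9'   # delim+fuzz+delim+;9 shape, per the get_unfiltered_chars docstring below

# Strip the delimiters and the trailing ;9 to get the bare fuzz chars,
# mirroring the fuzz_payload line above
fuzz_payload = payload.replace(delim, '').replace(';9', '')

# The nongreedy {1,80}? grabs whatever the server reflected between the delimiters
full_match = '%s.{1,80}?%s' % (delim, delim)
body = '<input value="%s&#39;&quot;(){}<>%s;9">' % (delim, delim)
print([(m.start(), m.group(), m.end()) for m in re.finditer(full_match, body)])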
@@ -47,19 +47,20 @@ def process_item(self, item, spider):
         body = body.lower()

         # XSS detection starts here
-        re_matches = sorted([(m.start(), m.group()) for m in re.finditer(full_match, body)])
+        re_matches = sorted([(m.start(), m.group(), m.end()) for m in re.finditer(full_match, body)])

         if len(re_matches) > 0:
-            scolon_matches = sorted([(m.start(), m.group()) for m in re.finditer(sc_full_match, body)])
+            #scolon_matches = sorted([(m.start(), m.group()) for m in re.finditer(sc_full_match, body)])
             # If regex finds anything, get lxml matches
             lxml_injs = self.get_lxml_matches(full_match, body, resp_url, delim)
             if lxml_injs:
                 err = None
                 if len(re_matches) != len(lxml_injs):
-                    spider.log('Error: mismatch in injections found by lxml and regex. Higher chance of false positive for %s' % resp_url)
+                    spider.log('Error: mismatch in injections found by lxml and regex; higher chance of false positive for %s' % resp_url)
+                    error = 'Error: mismatch in injections found by lxml and regex; higher chance of false positive'
                     mismatch = True

-                inj_data = self.combine_regex_lxml(lxml_injs, re_matches, scolon_matches, body, mismatch)
+                inj_data = self.combine_regex_lxml(lxml_injs, re_matches, body, mismatch, payload, delim)
                 # If mismatch is True, then "for offset in sorted(inj_data)" will fail with TypeError
                 try:
                     for offset in sorted(inj_data):
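Note: why the try/except is needed — if the mismatch path leaves inj_data as None (an assumption; that return path isn't shown in this hunk), sorting it raises TypeError:

inj_data = None  # presumed result on the mismatch path
try:
    for offset in sorted(inj_data):
        print(offset)
except TypeError:
    pass  # sorted(None) -> TypeError: 'NoneType' object is not iterable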
@@ -85,9 +86,9 @@ def process_item(self, item, spider):

             # Catch all test payload chars no delims
             # Catches DB errors
-            pl_lines_found = self.payloaded_lines(body, orig_payload)
+            pl_lines_found = self.payloaded_lines(body, fuzz_payload)
             if pl_lines_found:
-                item = self.make_item(meta, resp_url, pl_lines_found, orig_payload, None)
+                item = self.make_item(meta, resp_url, pl_lines_found, fuzz_payload, None)
                 if item:
                     item = self.url_item_filtering(item, spider)
                     item['error'] = 'Payload delims do not surround this injection point. Found via search for entire payload.'
@@ -199,40 +200,39 @@ def xss_logic(self, injection, meta, resp_url, error):
         # Unpack the injection
         #tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection
-        reflected_chars = injection[5]
+        # get_unfiltered_chars() can only return a string of 0+ characters, never None
+        unfiltered_chars = injection[5]
         payload = injection[4]
-        if ';' in reflected_chars:
-            payload += ';9'
         # injection[6] sometimes == '<p' maybe?
         # It happened POSTing chatRecord to http://service.taobao.com/support/minerva/robot_save_chat_record.htm
         # that page returned a Connection: Close header and no body
-        line = injection[6]+payload
-        ############### THIS NEEDS TO BE THE REFLECTED PAYLOAD
+        line = injection[6]
         item_found = None

-        # get_reflected_chars() always returns a string
-        if len(reflected_chars) > 0:
+        if len(unfiltered_chars) > 0:
             chars_payloads = self.get_breakout_chars(injection, resp_url)
             # breakout_chars always returns a {}, never None
             if len(chars_payloads) > 0:
                 sugg_payloads = []
                 for chars in chars_payloads:
-                    if set(chars).issubset(set(reflected_chars)):
-                        # Get rid of possible payloads with > in them if > not in reflected_chars
+                    if set(chars).issubset(set(unfiltered_chars)):
+                        # Get rid of possible payloads with > in them if > not in unfiltered_chars
                         item_found = True
                         for possible_payload in chars_payloads[chars]:
-                            if '>' not in reflected_chars:
+                            if '>' not in unfiltered_chars:
                                 if '>' in possible_payload:
                                     continue
                             sugg_payloads.append(possible_payload)

                 if item_found:
-                    return self.make_item(meta, resp_url, line, reflected_chars, sugg_payloads)
+                    return self.make_item(meta, resp_url, line, unfiltered_chars, sugg_payloads)

     def get_breakout_chars(self, injection, resp_url):
         ''' Returns either None if no breakout chars were found
         or a list of sets of potential breakout characters '''

-        tag_index, tag, attr, attr_val, payload, reflected_chars, line = injection
+        tag_index, tag, attr, attr_val, payload, unfiltered_chars, line = injection
         pl_delim = payload[:6]
         full_match = '%s.{0,85}?%s' % (pl_delim, pl_delim)
         line = re.sub(full_match, 'INJECTION', line)
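Note: a quick illustration of the INJECTION substitution above, using a placeholder delimiter:

import re

pl_delim = '9zqjx1'  # placeholder; the real value is payload[:6]
line = '<input value="%s\'"(){}%s;9">' % (pl_delim, pl_delim)
full_match = '%s.{0,85}?%s' % (pl_delim, pl_delim)
print(re.sub(full_match, 'INJECTION', line))
# -> <input value="INJECTION;9">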
@@ -593,7 +593,7 @@ def html_parser(self, body, resp_url):
 #            self.log('XMLSyntaxError from lxml on %s' % resp_url)
 #            return

-    def combine_regex_lxml(self, lxml_injs, full_matches, scolon_matches, body, mismatch):
+    def combine_regex_lxml(self, lxml_injs, full_matches, body, mismatch, payload, delim):
         ''' Combine lxml injection data with the 2 regex injection search data '''

         all_inj_data = {}
@@ -626,17 +626,16 @@ def combine_regex_lxml(self, lxml_injs, full_matches, scolon_matches, body, mismatch):
             else:
                 tag_delim = '<'+tag

-            match_offset = match[0]
-            payload = match[1]
-            pl_delim = payload[:6]
-            sub = pl_delim+'subbed'
-
-            match_offset = match[0]
-            split_body = body[:match_offset]
+            match_start_offset = match[0]
+            ret_between_delim = match[1]
+            match_end_offset = match[2]
+            sub = delim+'subbed'
+            split_body = body[:match_start_offset]
+            ref_payload = body[match_start_offset:][:(match_end_offset - match_start_offset)+2]
             # split the body at the tag, then take the last fragment
             # which is closest to the injection point as regexed
             line_no_tag = split_body.split(tag_delim)[-1]#.replace('\\"', '').replace("\\'", "")
-            line = tag_delim + line_no_tag
+            line = tag_delim + line_no_tag + ref_payload
             # Sometimes it may split wrong, in which case we drop that lxml match
             if line_no_tag.startswith('<doctype') or line_no_tag.startswith('<html'):
                 line = ''
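Note: a sketch of how the new offsets slice the reflected payload straight out of the body (values are illustrative, not from a real response):

body = 'abc123<p>9zqjx1\'"(){}9zqjx1;9</p>'
match_start_offset = body.find('9zqjx1')                              # m.start()
match_end_offset = body.find('9zqjx1', match_start_offset + 1) + 6    # m.end()

# The +2 tacks on the ';9' that trails the closing delimiter
ref_payload = body[match_start_offset:][:(match_end_offset - match_start_offset) + 2]
print(ref_payload)  # -> 9zqjx1'"(){}9zqjx1;9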
@@ -650,15 +649,49 @@
                     attr_val = attr_dict[a]
                     break

-            #unfiltered_chars = self.get_unfiltered_chars(payload, pl_delim, scolon_matches, match_offset)
-            reflected_chars = self.get_reflected_chars(tag, attr, payload, pl_delim, scolon_matches, match_offset)
+            unfiltered_chars = self.get_unfiltered_chars(payload, ref_payload, delim, tag, attr)
             # Common false+ shows only "> as unfiltered if script parses the chars between 2 unrelated delim strs
-            #if reflected_chars == '">':
-            #    reflected_chars = ''
-            all_inj_data[match_offset] = [tag_index, tag, attr, attr_val, payload, reflected_chars, line]
+            #if unfiltered_chars == '">':
+            #    unfiltered_chars = ''
+            all_inj_data[match_start_offset] = [tag_index, tag, attr, attr_val, payload, unfiltered_chars, line]

         return all_inj_data

+    def get_unfiltered_chars(self, payload, ref_payload, delim, tag, attr):
+        """
+        Pull out just the unfiltered chars from the reflected chars
+        payload = delim+fuzz+delim+;9
+        """
+        # Remove delim from payload and add ;
+        ref_chars = ref_payload.replace(delim, '').replace('9', '')
+        fuzz_chars = payload.replace(delim, '').replace('9', '')
+        remove_chars = set([])
+        unfiltered_chars_list = []
+
+        html_entities = ['&#39', '&quot;', '&lt;', '&gt;']
+        # Remove html encoded entities
+        for entity in html_entities:
+            if entity in ref_chars:
+                ref_chars = ref_chars.replace(entity, '')
+
+        # If injection is inside script tag, remove the escaped chars
+        if tag == 'script' or attr in self.event_attributes():
+            ref_chars = ref_chars.replace("\\'", "").replace('\\"', '').replace('\;', '').replace('\\>', '').replace('\\<', '').replace('\\/', '')
+
+        for c in fuzz_chars:
+            if c in ref_chars:
+                continue
+            else:
+                remove_chars.add(c)
+
+        for char in fuzz_chars:
+            if char not in remove_chars:
+                unfiltered_chars_list.append(char)
+
+        unfiltered_chars = ''.join(unfiltered_chars_list)
+
+        return unfiltered_chars
+
     def accurate_attr(self, tag, attrs_attrvals, match, line):
         ''' lxml cannot determine the order of attrs which is important
         if we're using regex to find the location of the inj so this func
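Note: a condensed, standalone version of the new get_unfiltered_chars logic for experimenting outside the pipeline. The delimiter, fuzz string, and event-attribute set are placeholders, and self.event_attributes() is stubbed; this is a sketch, not the pipeline method itself.

def unfiltered(payload, ref_payload, delim, tag, attr):
    event_attrs = {'onclick', 'onerror', 'onload'}   # stand-in for self.event_attributes()
    ref_chars = ref_payload.replace(delim, '').replace('9', '')
    fuzz_chars = payload.replace(delim, '').replace('9', '')
    # Strip HTML-encoded entities so encoded chars don't count as unfiltered
    for entity in ['&#39', '&quot;', '&lt;', '&gt;']:
        ref_chars = ref_chars.replace(entity, '')
    # Inside script tags / event handlers, escaped chars count as filtered too
    if tag == 'script' or attr in event_attrs:
        ref_chars = ref_chars.replace("\\'", "").replace('\\"', '')
    # Keep only the fuzz chars that came back unmodified
    return ''.join(c for c in fuzz_chars if c in ref_chars)

delim = '9zqjx1'
payload = delim + '\'"<>' + delim + ';9'
# Server entity-encoded " < > but reflected ' and ; untouched
ref_payload = delim + "'&quot;&lt;&gt;" + delim + ';9'
print(unfiltered(payload, ref_payload, delim, 'input', 'value'))  # -> ';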
@@ -857,43 +890,13 @@ def unescape_payload(self, payload):

         return payload

-    def get_reflected_chars(self, tag, attr, payload, delim, scolon_matches, match_offset):
+    def get_reflected_chars(self, tag, attr, ref_payload, delim, body, match_end_offset):
         ''' Check for the special chars and append them to a master list of tuples, one tuple per injection point
         Always returns a string '''

-        # Change the delim+test_str+delim payload to just test_str
-        # Make sure js payloads remove escaped ' and ", also remove ;
-        # since ; will show up in html encoded entities. If ; is unfiltered
-        # it will be added after this function
-        chars_between_delim = payload.replace(delim, '')#.replace("\\'", "").replace('\\"', '').replace(';', '').replace('\\>', '').replace('\\<', '').replace('\\/', '')
+        returned_chars = ref_payload.replace(delim, '').replace('9','')

-        # If injection is inside script tag, remove the escaped chars
-        if tag == 'script' or attr in self.event_attributes():
-            chars_between_delim = chars_between_delim.replace("\\'", "").replace('\\"', '').replace(';', '').replace('\\>', '').replace('\\<', '').replace('\\/', '')
-        else:
-            # If it's not a script then just remove the \'s otherwise they show up in Unfiltered in the item
-            chars_between_delim = chars_between_delim.replace("\\", "")
-
-        # Check if a semicolon needs to be added to the unfiltered chars
-        for scolon_match in scolon_matches:
-            # Confirm the string offset of the match is the same
-            # Since scolon_match will only exist when ;9 was found
-            scolon_offset = scolon_match[0]
-            if match_offset == scolon_offset:
-                chars_between_delim += ';'
-                break
-
-        # Catch pesky false positives usually inside super long script tags from
-        # fucked up html like lots of JSON in a script tag like google has on some pages
-        # if len(unfiltered_chars) != len(set(unfiltered_chars)):
-        #     unfiltered_chars = ''
-
-        # unfiltered_chars = ''.join(unfiltered_chars)
-
-        if len(chars_between_delim) > 0:
-            return chars_between_delim
-        else:
-            return ''
+        return returned_chars

     def url_item_filtering(self, item, spider):
         ''' Make sure we're not just repeating the same URL XSS over and over '''
2 changes: 1 addition & 1 deletion xsscrapy/spiders/xss_spider.py
@@ -419,7 +419,7 @@ def make_payload(self):
     def payload_end_of_url(self, url):
         ''' Payload the end of the URL to catch some DOM(?) and other reflected XSSes '''

-        payload = self.make_payload()
+        payload = self.make_payload().replace('/', '')
         # Make URL test and delim strings unique
         if url[-1] == '/':
             payloaded_url = url+payload
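Note: why '/' gets stripped here — appended to the end of a URL, a '/' inside the payload would open a new path segment and change which resource the server resolves. Illustrative only; the payload and URL below are placeholders:

payload = "9zqjx1'\"/<>9zqjx1"   # placeholder payload containing a '/'
url = 'http://example.com/page/'

print((url + payload).split('/'))
# -> ['http:', '', 'example.com', 'page', '9zqjx1\'"', '<>9zqjx1']

print((url + payload.replace('/', '')).split('/'))
# -> ['http:', '', 'example.com', 'page', '9zqjx1\'"<>9zqjx1']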
