Worker.py
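"""
Crawler worker thread.

Pulls raw HTML pages from an input queue, extracts <a> links and <form>
elements, keeps only previously unseen same-domain URLs, and pushes them to
the output queue. URLs carrying query parameters are also queued as
SQL-injection candidates, and newly seen forms go to the forms queue.
"""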
import threading
import re
import urlparse  # Python 2 standard library
from time import sleep

from bs4 import BeautifulSoup, SoupStrainer

from Form import Form

class WorkThread(threading.Thread):
    def __init__(self, queue, out_queue, base, sqli_queue, forms_queue):
        threading.Thread.__init__(self)
        self.queue = queue              # raw HTML pages waiting to be parsed
        self.out_queue = out_queue      # newly discovered same-domain URLs
        self.sqli_queue = sqli_queue    # URLs with query parameters (SQLi candidates)
        self.forms_queue = forms_queue  # newly discovered forms
        self.base = base                # parsed base URL of the target site
        self.seen = []                  # URLs already reported
        self.seen_forms = []            # forms already reported
        self.sqli = []
        self.juicy = []                 # interesting document links (pdf, xls, doc, txt)

    def run(self):
        while True:
            if self.queue.empty():
                # Give the fetchers a chance to publish more pages, then
                # stop once the input queue has stayed empty.
                sleep(0.1)
                self.out_queue.join()
                if self.queue.empty():
                    return True
            else:
                html = self.queue.get()  # get raw HTML
                self.work(html)          # extract links and forms
                self.queue.task_done()

    def work(self, html):
        urls = self.crunch_links(self.extract_links(html))
        self.eat_urls(urls)
        self.eat_forms(self.extract_forms(html))

    def extract_links(self, html):
        # Parse only <a> tags and collect their href values, dropping empty ones.
        raw_links = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer("a"))
        return filter(None, [link.get('href') for link in raw_links])

    def extract_forms(self, html):
        # Parse only <form> tags.
        raw_forms = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer("form"))
        return filter(None, raw_forms)

    def crunch_links(self, links):
        # Resolve raw hrefs into parsed, same-domain URLs, skipping
        # javascript:, mailto:, fragment-only and image links.
        r_image = re.compile(r".*(jpg|png|gif|JPG|PNG|GIF)$")
        pending = []
        for link in links:
            if not (link.startswith('java') or link.startswith('mailto')
                    or link.startswith("#") or r_image.match(link)):
                if link.startswith("http"):
                    # Absolute URL: keep it only if it belongs to the target domain.
                    if self.match_base(urlparse.urlparse(link)):
                        pending.append(urlparse.urlparse(link))
                    continue
                # Relative URL: resolve it against the base URL.
                if not link.startswith('/'):
                    pending.append(urlparse.urlparse(self.base.geturl() + "/" + str(link)))
                else:
                    pending.append(urlparse.urlparse(self.base.geturl() + str(link)))
        return pending

    def eat_urls(self, urls):
        # Report each unseen URL and flag it if it looks injectable.
        for url in urls:
            if self.is_a_new_url(url):
                self.out_queue.put(url)
                self.detect_sqli(url)
                self.seen.append(url)

    def eat_forms(self, forms):
        for form in forms:
            form = Form(form)
            if self.is_a_new_form(form):
                self.forms_queue.put(form)
                self.seen_forms.append(form)

    def is_a_new_url(self, url):
        # A URL is "new" unless we have already seen the same host/path with
        # the same set of query parameter names.
        for seen_url in self.seen:
            if self.match_params(seen_url, url) and self.match_url(seen_url, url):
                return False
        return True

    def is_a_new_form(self, form):
        for seen_form in self.seen_forms:
            if seen_form == form:
                return False
        return True

    def match_params(self, url1, url2):
        return set(urlparse.parse_qs(url1.query).keys()) == set(urlparse.parse_qs(url2.query).keys())

    def match_url(self, url1, url2):
        return (url1.netloc + url1.path) == (url2.netloc + url2.path)

    def match_base(self, url1):
        # The base may have been parsed with or without a scheme, so compare
        # the candidate's netloc and path against the base's netloc + path.
        url2 = self.base
        return url1.netloc == (url2.netloc + url2.path) or url1.path == (url2.netloc + url2.path)

    def detect_sqli(self, url):
        # URLs with query string parameters are potential SQL injection points.
        if re.search(r'.*\?.*=.*', url.geturl()):
            self.sqli_queue.put(url)
            return True
        return False

    def detect_juicy_files(self, url):
        # Collect links to documents that may hold interesting information.
        if re.search(r".*\.(pdf|xls|doc|txt)", url.geturl()):
            self.juicy.append(url)
            return True
        return False
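

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes Python 2 and a driver that fetches pages and feeds their raw HTML
# into the input queue; the queue wiring and http://example.com below are
# placeholder assumptions, not the project's actual entry point.
if __name__ == '__main__':
    import Queue
    import urllib2

    base = urlparse.urlparse("http://example.com")
    in_queue = Queue.Queue()      # raw HTML pages to parse
    out_queue = Queue.Queue()     # discovered same-domain URLs
    sqli_queue = Queue.Queue()    # URLs with query parameters (SQLi candidates)
    forms_queue = Queue.Queue()   # discovered forms

    # Seed the input queue before starting the worker so it does not exit
    # immediately on an empty queue.
    in_queue.put(urllib2.urlopen(base.geturl()).read())

    worker = WorkThread(in_queue, out_queue, base, sqli_queue, forms_queue)
    worker.setDaemon(True)
    worker.start()

    # Wait until the seeded page has been parsed, then report what was found.
    in_queue.join()
    print "URLs found:", out_queue.qsize(), "SQLi candidates:", sqli_queue.qsize()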