torcrawl.py
#!/usr/bin/python

help = '''
TorCrawl.py is a Python script to crawl and extract (regular or onion)
webpages through the TOR network.

usage: python torcrawl.py [options]
       python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
       python torcrawl.py -v -w -u http://www.github.com -o github.htm
       python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
       python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -o GitHub

General:
-h, --help              : Help
-v, --verbose           : Show more information about the progress
-u, --url *.onion       : URL of webpage to crawl or extract
-w, --without           : Without the use of the TOR relay

Extract:
-e, --extract           : Extract page's code to terminal or file
                          (Default: terminal)
-i, --input filename    : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)

Crawl:
-c, --crawl             : Crawl website (Default output on /links.txt)
-d, --cdepth            : Set depth of crawl's travel (Default: 1)
-z, --exclusions        : Paths that you don't want to include (TODO)
-s, --simultaneous      : How many pages to visit at the same time (TODO)
-p, --pause             : The length of time the crawler will pause
                          (Default: 0)
-f, --folder            : The root directory which will contain the
                          generated files
-l, --log               : A save log will let you see which URLs were
                          visited (TODO)

GitHub: github.com/MikeMeliz/TorCrawl.py
License: GNU General Public License v3.0
'''
import os
import sys
import socket
import argparse

import socks

# TorCrawl Modules
from modules.crawler import crawler
from modules.extractor import extractor
from modules.checker import *
# Set socket and connection with TOR network
def connectTor():
    try:
        port = 9050
        # Set socks proxy and wrap the urllib module
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port)
        socket.socket = socks.socksocket

        # Perform DNS resolution through the socket
        def getaddrinfo(*args):
            return [(socket.AF_INET, socket.SOCK_STREAM, 6, '',
                     (args[0], args[1]))]

        socket.getaddrinfo = getaddrinfo
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e + "\n## Can't establish connection with TOR")
def main():
    # Initialize necessary variables
    inputFile = outputFile = ''
    cpause = 0
    cdepth = 1

    # Get arguments with argparse
    parser = argparse.ArgumentParser(
        description="TorCrawl.py is a Python script to crawl and extract "
                    "(regular or onion) webpages through the TOR network.")

    # General
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='Show more information about the progress')
    parser.add_argument('-u',
                        '--url',
                        required=True,
                        help='URL of webpage to crawl or extract')
    parser.add_argument('-w',
                        '--without',
                        action='store_true',
                        help='Without the use of the TOR relay')

    # Extract
    parser.add_argument('-e',
                        '--extract',
                        action='store_true',
                        help='Extract page\'s code to terminal or file')
    parser.add_argument('-i',
                        '--input',
                        help='Input file with URL(s) (separated by line)')
    parser.add_argument('-o',
                        '--output',
                        help='Output page(s) to file(s) (for one page)')

    # Crawl
    parser.add_argument('-c',
                        '--crawl',
                        action='store_true',
                        help='Crawl website (Default output on /links.txt)')
    parser.add_argument('-d',
                        '--cdepth',
                        help='Set depth of crawl\'s travel (Default: 1)')
    parser.add_argument('-p',
                        '--pause',
                        help='The length of time the crawler will pause')
    parser.add_argument('-l',
                        '--log',
                        action='store_true',
                        help='A save log will let you see which URLs were visited')
    parser.add_argument('-f',
                        '--folder',
                        help='The root directory which will contain the generated files')

    args = parser.parse_args()

    # Parse arguments to variables
    if args.input:
        inputFile = args.input
    if args.output:
        outputFile = args.output
    if args.cdepth:
        cdepth = args.cdepth
    if args.pause:
        cpause = args.pause
    # Connect to TOR
    if args.without is False:
        checkTor(args.verbose)
        connectTor()

    if args.verbose:
        checkIP()
        print('## URL: ' + args.url)

    # Canonicalize the website's URL and create the path for output
    if len(args.url) > 0:
        global website
        global outpath
        website = urlcanon(args.url, args.verbose)
        if args.folder is not None:
            outpath = folder(args.folder, args.verbose)
        else:
            outpath = folder(website, args.verbose)

    if args.crawl:
        lst = crawler(website, cdepth, cpause, outpath, args.log, args.verbose)
        lstfile = open(outpath + '/links.txt', 'w+')
        for item in lst:
            lstfile.write("%s\n" % item)
        lstfile.close()
        print("## File created on " + os.getcwd() + "/" + outpath + "/links.txt")
        if args.extract:
            inputFile = outpath + "/links.txt"
            extractor(website, args.crawl, outputFile, inputFile, outpath)
    else:
        extractor(website, args.crawl, outputFile, inputFile, outpath)


if __name__ == "__main__":
    main()
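# Typical invocation, drawn from the usage examples in the help text above
# (the pause value's unit is whatever the crawler module interprets; shown
# here only for illustration):
#
#   python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
#
# This crawls the onion site two levels deep and writes the collected URLs
# to <outpath>/links.txt, as reported by the "## File created on ..." message.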