crawler.py
import os
import subprocess
import sys
sys.path.append('./gRPC')
import argparse
import numpy as np
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import start_xvfb, stop_xvfb
import utils as ut
from common import *
from torcontroller import *
import datetime
from gRPC import client
from common import ConnError, HasCaptcha, Timeout, OtherError
import utils
# Remember to change this path when crawling from the host vs. from a Docker container.
TBB_PATH = '/home/docker/tor-browser_en-US/'


def parse_arguments():
    parser = argparse.ArgumentParser(description='Crawl Alexa top websites and capture the traffic.')
    parser.add_argument('--start',
                        type=int,
                        metavar='<start ind>',
                        default=0,
                        help='Start from this site index in the list (inclusive).')
    parser.add_argument('--end',
                        type=int,
                        metavar='<end ind>',
                        default=50,
                        help='End at this site index in the list (exclusive).')
    parser.add_argument('--batch', '-b',
                        type=int,
                        metavar='<Num of batches>',
                        default=5,
                        help='Number of crawl batches; Tor restarts at each batch.')
    parser.add_argument('-m',
                        type=int,
                        metavar='<Num of instances in each batch>',
                        default=5,
                        help='Number of instances to crawl for each website in each batch. '
                             'In unmonitored mode, Tor restarts after every m instances.')
    parser.add_argument('--open',
                        type=int,
                        default=0,
                        help='Crawl monitored or unmonitored sites (default: 0, monitored).')
    parser.add_argument('--offset',
                        type=int,
                        default=200,
                        help='Index of the first unmonitored site in the crawl list; entries before it are monitored sites.')
    parser.add_argument('--torrc',
                        type=str,
                        default=None,
                        help='Torrc file path.')
    parser.add_argument('--mode',
                        type=str,
                        required=True,
                        metavar='<parse mode>',
                        help='The type of dataset: clean, burst, etc.')
    parser.add_argument('-s',
                        action='store_true',
                        default=False,
                        help='Take a screenshot after each load (default: False).')
    parser.add_argument('-w',
                        type=str,
                        default=None,
                        help='Self-provided web list.')
    parser.add_argument('-u',
                        action='store_true',
                        default=False,
                        help='Crawl unmonitored sites instead of monitored ones (default: monitored).')
    parser.add_argument('-l',
                        type=str,
                        default=None,
                        help='Crawl only specific sites, given as a list.')
    parser.add_argument('--crawllog',
                        type=str,
                        metavar='<log path>',
                        default=None,
                        help='Path to the crawler log file. Prints to stdout by default.')
    parser.add_argument('--tbblog',
                        type=str,
                        metavar='<log path>',
                        default=None,
                        help='Path to the TBB log file. Prints to stdout by default.')
    parser.add_argument('--headless',
                        action='store_false',
                        default=True,
                        help='Pass this flag to disable headless mode and use xvfb instead. '
                             'Headless is the default; make sure to use a customized headless TBB in that case.')
    parser.add_argument('--who',
                        type=str,
                        metavar='<email sender>',
                        default='',
                        help='The name of the sender that will send an email after crawling finishes.')
    # Parse arguments
    args = parser.parse_args()
    return args
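

# A minimal example invocation (a sketch, assuming a Tor Browser bundle at TBB_PATH,
# a gRPC capture server listening at cm.gRPCAddr, and a torrc file at ./torrc;
# all of these are deployment-specific assumptions, not fixed by this script):
#
#   export MOZ_HEADLESS=1
#   python crawler.py --mode clean --start 0 --end 50 -b 5 -m 5 --torrc ./torrc -s
#
# This crawls sites 0-49 of the monitored list in 5 batches of 5 instances each and
# takes a screenshot after every page load.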


class WFCrawler:
    def __init__(self, args, wlist, controller, gRPCClient, outputdir, picked_inds=None):
        self.batch = args.batch
        self.m = args.m
        self.offset = args.offset
        self.start = args.start
        self.end = args.end
        self.tbblog = args.tbblog
        self.headless = args.headless
        self.driver = None
        self.controller = controller
        self.outputdir = outputdir
        self.wlist = wlist
        self.s = args.s
        self.picked_inds = picked_inds
        self.gRPCClient = gRPCClient
        self.last_crawl_time = time.time()
        if self.headless:
            logger.info("Run in headless mode.")
        else:
            logger.info("Run in non-headless mode.")

    def write_to_badlist(self, filename, url, reason):
        with open(join(self.outputdir, 'bad.list'), 'a+') as f:
            f.write(filename + '\t' + url + '\t' + reason + '\n')
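
    # For reference, each bad.list entry written above is a single tab-separated line
    # of the form <capture file>\t<url>\t<reason>. A hypothetical example:
    #   /data/clean/3-12.cell    https://example.com    Timeout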

    def get_driver(self):
        ffprefs = {
        }
        if self.headless:
            # Note: this flag alone does not work, because tbselenium relies on the
            # deprecated set_option; exporting MOZ_HEADLESS=1 does work.
            headless = True
        else:
            headless = False
        caps = DesiredCapabilities().FIREFOX
        caps['pageLoadStrategy'] = 'normal'
        driver = TorBrowserDriver(tbb_path=TBB_PATH, tor_cfg=1, pref_dict=ffprefs,
                                  tbb_logfile_path=self.tbblog,
                                  socks_port=9050, capabilities=caps, headless=headless)
        driver.profile.set_preference("dom.webdriver.enabled", False)
        driver.profile.set_preference('useAutomationExtension', False)
        driver.profile.update_preferences()
        logger.info("profile dir: {}".format(driver.profile.profile_dir))
        driver.set_page_load_timeout(SOFT_VISIT_TIMEOUT)
        return driver

    def crawl(self, url, filename):
        """Perform a single page load of url."""
        # Try to launch the browser, retrying on failure.
        tries = 3
        sleeptime = 5
        for i in range(tries):
            pid = None
            try:
                # with ut.timeout(BROWSER_LAUNCH_TIMEOUT):
                driver = self.get_driver()
                pid = driver.service.process.pid
            except Exception as exc:
                if i < tries - 1:
                    logger.error("Failed to launch browser, will retry {} more time(s). Err msg: {}".format(tries - (i + 1), exc))
                    if pid:
                        logger.info("Kill remaining browser process")
                        ut.kill_all_children(pid)
                        driver.clean_up_profile_dirs()
                    time.sleep(sleeptime)
                    sleeptime += 10
                    continue
                else:
                    raise OSError("Failed to launch browser after {} tries".format(tries))
            break
        # Try to crawl the website.
        try:
            with ut.timeout(HARD_VISIT_TIMEOUT):
                err = self.gRPCClient.sendRequest(turn_on=True, file_path='{}.cell'.format(filename))
                if err is not None:
                    logger.error(err)
                    # Send a stop-record request anyway.
                    self.gRPCClient.sendRequest(turn_on=False, file_path='')
                    return err
                time.sleep(1)
                logger.info("Start capturing.")
                self.last_crawl_time = time.time()
                driver.get(url)
                time.sleep(1)
                if self.s:
                    driver.get_screenshot_as_file(filename + '.png')
                if ut.check_conn_error(driver):
                    self.write_to_badlist(filename + '.cell', url, "ConnError")
                elif ut.check_captcha(driver.page_source.strip().lower()):
                    self.write_to_badlist(filename + '.cell', url, "HasCaptcha")
        except (ut.HardTimeoutException, TimeoutException):
            logger.warning("{} got timeout".format(url))
            self.write_to_badlist(filename + '.cell', url, "Timeout")
        except Exception as exc:
            logger.warning("Unknown error: {}".format(exc))
            self.write_to_badlist(filename + '.cell', url, "OtherError")
        finally:
            t = time.time() - self.last_crawl_time
            ut.kill_all_children(pid)
            driver.clean_up_profile_dirs()
            subprocess.call("rm -r /tmp/*", shell=True)
            logger.info("Firefox killed by pid. Cleaned up tmp folders.")
            # We don't care about the error here: if something goes wrong, we will catch it
            # the next time a turn_on=True request is sent in the next loop.
            time.sleep(CRAWLER_DWELL_TIME)
            self.gRPCClient.sendRequest(turn_on=False, file_path='')
            logger.info("Stop capturing, save to {}.cell.".format(filename))
            logger.info("Loaded {:.2f}s".format(t))
        time.sleep(np.random.uniform(0, GAP_BETWEEN_SITES_MAX))
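
    # Note on the capture protocol used in crawl() above: sendRequest(turn_on=True,
    # file_path='<name>.cell') asks the capture server to start recording into that
    # file, and sendRequest(turn_on=False, file_path='') stops the recording. Nothing
    # beyond these two calls is assumed about the gRPC client module here.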

    def crawl_mon(self):
        """One crawl task over all the monitored websites."""
        # Crawl monitored webpages in a round-robin fashion; restart Tor after every m visits of the whole list.
        for bb in range(self.batch):
            with self.controller.launch():
                should_restart_tor = False
                logger.info("Start Tor and sleep {}s".format(GAP_AFTER_LAUNCH))
                time.sleep(GAP_AFTER_LAUNCH)
                for wid, website in enumerate(self.wlist):
                    if should_restart_tor:
                        break
                    wid = wid + self.start
                    if (self.picked_inds is not None) and (wid not in self.picked_inds):
                        continue
                    for mm in range(self.m):
                        i = bb * self.m + mm
                        filename = join(self.outputdir, str(wid) + '-' + str(i))
                        logger.info("{:d}-{:d}: {}".format(wid, i, website))
                        err = self.crawl(website, filename)
                        if err is not None:
                            logger.error("gRPC server broke down. Try to restart Tor.")
                            should_restart_tor = True
                            break
                        # Change identity.
                        self.controller.change_identity()
            logger.info("Finish batch #{}, sleep {}s.".format(bb + 1, GAP_BETWEEN_BATCHES))
            time.sleep(GAP_BETWEEN_BATCHES)

    def crawl_unmon(self):
        """One crawl task over all the unmonitored websites."""
        # Crawl unmonitored webpages in a round-robin fashion; restart Tor every m sites (each visited once).
        should_restart_tor = False
        for raw_wid, website in enumerate(self.wlist):
            if raw_wid % self.m == 0 or should_restart_tor:
                logger.info("Restart Tor now.")
                self.controller.restart_tor()
                should_restart_tor = False
                time.sleep(GAP_BETWEEN_BATCHES)
            assert self.controller.tor_process is not None
            wid2list = raw_wid + self.start
            wid2file = raw_wid + self.start - self.offset
            if (self.picked_inds is not None) and (wid2list not in self.picked_inds):
                continue
            filename = join(self.outputdir, str(wid2file))
            logger.info("{:d}: {}".format(wid2list, website))
            err = self.crawl(website, filename)
            if err is not None:
                logger.error("gRPC server broke down. Try to restart Tor.")
                should_restart_tor = True
            else:
                self.controller.change_identity()
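
    # Index bookkeeping in crawl_unmon() above: wid2list is the site's position in the
    # full crawl list, while wid2file subtracts --offset so that unmonitored captures
    # are numbered from 0. For example, with --start 200 and the default --offset of
    # 200, the first unmonitored site has wid2list = 200 and is saved to file "0".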

    def clean_up(self):
        err_type_cnt = {'ConnError': 0,
                        'HasCaptcha': 0,
                        'Timeout': 0,
                        'OtherError': 0, }
        bad_list = set()
        if os.path.exists(join(self.outputdir, 'bad.list')):
            with open(join(self.outputdir, 'bad.list'), 'r') as f:
                tmp = f.readlines()
            for entry in tmp:
                entry = entry.rstrip('\n').split('\t')
                bad_list.add((entry[0], entry[1], entry[2]))
        error_num = len(bad_list)
        logger.info("Found {} bad (including Timeout) loadings.".format(error_num))
        removed_list = set()
        for bad_item in bad_list:
            w, url, reason = bad_item[0], bad_item[1], bad_item[2]
            if w in removed_list:
                continue
            else:
                removed_list.add(w)
            if reason == 'ConnError' and ConnError:
                err_type_cnt['ConnError'] += 1
                subprocess.call("rm " + w, shell=True)
            elif reason == 'HasCaptcha' and HasCaptcha:
                err_type_cnt['HasCaptcha'] += 1
                subprocess.call("rm " + w, shell=True)
            elif reason == 'Timeout' and Timeout:
                err_type_cnt['Timeout'] += 1
                subprocess.call("rm " + w, shell=True)
            elif reason == 'OtherError' and OtherError:
                err_type_cnt['OtherError'] += 1
                subprocess.call("rm " + w, shell=True)
        logger.info(err_type_cnt)


def main():
    args = parse_arguments()
    logger = utils.config_logger(args.crawllog)
    assert args.end > args.start
    if args.u:
        web_list_dir = unmon_list
    else:
        web_list_dir = mon_list
    if args.w:
        web_list_dir = args.w
    with open(web_list_dir, 'r') as f:
        wlist = f.readlines()[args.start: args.end]
    websites = []
    for w in wlist:
        if "https" not in w:
            websites.append("https://" + w.rstrip("\n"))
        else:
            websites.append(w.rstrip("\n"))
    assert len(websites) > 0
    if args.l:
        l_inds = ut.pick_specific_webs(args.l)
        assert len(l_inds) > 0
    else:
        l_inds = None
    outputdir = utils.init_directories(args.mode, args.u)
    controller = TorController(torrc_path=args.torrc)
    gRPCClient = client.GRPCClient(cm.gRPCAddr)
    wfcrawler = WFCrawler(args, websites, controller, gRPCClient, outputdir, picked_inds=l_inds)
    xvfb_display = None
    if not args.headless:
        xvfb_display = start_xvfb(1280, 800)
    try:
        logger.info(args)
        if args.open:
            wfcrawler.crawl_unmon()
        else:
            wfcrawler.crawl_mon()
        ut.sendmail(args.who, "'Crawler Message: Crawl done at {}!'".format(datetime.datetime.now()))
    except KeyboardInterrupt:
        sys.exit(-1)
    except Exception as e:
        ut.sendmail(args.who, "'Crawler Message: An error occurred:\n{}'".format(e))
    finally:
        if not args.headless and (xvfb_display is not None):
            stop_xvfb(xvfb_display)
        wfcrawler.controller.quit()
        # Clean up bad webs.
        wfcrawler.clean_up()


if __name__ == "__main__":
    main()