tdicrawler.py
#!/usr/bin/python
# "tiny, dirty, iffy" web crawler (Python 2) from
# http://blog.webhose.io/2015/08/12/tiny-basic-multi-threaded-web-crawler-in-python/
import sys, thread, Queue, re, urllib, urlparse, time
dupcheck = set()        # URLs already crawled or queued
q = Queue.Queue(100)    # bounded work queue of URLs to fetch
q.put(sys.argv[1])      # seed URL from the command line
def queueURLs(html, origLink):
    # Pull every href value out of the page and enqueue the unseen ones.
    for url in re.findall('''<a[^>]+href=["'](.[^"']+)["']''', html, re.I):
        # Absolute links are kept as-is (minus any #fragment); relative links
        # are prefixed with the scheme and host of the page they came from.
        link = url.split("#", 1)[0] if url.startswith("http") else '{uri.scheme}://{uri.netloc}'.format(uri=urlparse.urlparse(origLink)) + url.split("#", 1)[0]
        if link in dupcheck:
            continue
        dupcheck.add(link)
        if len(dupcheck) > 99999:
            dupcheck.clear()  # crude memory cap: forget history and allow re-crawls
        q.put(link)
def getHTML(link):
    try:
        html = urllib.urlopen(link).read()
        # Save the page under a timestamped name, with its source URL
        # recorded as an HTML comment on the first line.
        open(str(time.time()) + ".html", "w").write("<!-- %s -->" % link + "\n" + html)
        queueURLs(html, link)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        pass  # "iffy" by design: any fetch or parse error is silently dropped
while True:
    # Spawn a new worker thread per queued URL, at most two per second.
    thread.start_new_thread(getHTML, (q.get(),))
    time.sleep(0.5)
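
The script above is Python 2 only: thread, Queue, urllib.urlopen, and urlparse were all renamed or reorganized in Python 3. Below is a minimal Python 3 sketch of the same crawler (an adaptation, not from the original post); it swaps thread.start_new_thread for daemon threading.Thread workers and the manual scheme/netloc concatenation for urllib.parse.urljoin:

#!/usr/bin/env python3
# Python 3 sketch of the same "tiny, dirty, iffy" crawler.
import re, sys, time, queue, threading
import urllib.parse, urllib.request

dupcheck = set()        # URLs already crawled or queued
q = queue.Queue(100)    # bounded work queue of URLs to fetch
q.put(sys.argv[1])      # seed URL from the command line

def queue_urls(html, orig_link):
    for url in re.findall(r'''<a[^>]+href=["'](.[^"']+)["']''', html, re.I):
        # urljoin resolves relative links against the page they came from.
        link = urllib.parse.urljoin(orig_link, url).split("#", 1)[0]
        if link in dupcheck:
            continue
        dupcheck.add(link)
        if len(dupcheck) > 99999:
            dupcheck.clear()  # same crude memory cap as the original
        q.put(link)

def get_html(link):
    try:
        html = urllib.request.urlopen(link).read().decode("utf-8", "replace")
        with open(str(time.time()) + ".html", "w", encoding="utf-8") as f:
            f.write("<!-- %s -->\n%s" % (link, html))
        queue_urls(html, link)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        pass  # still "iffy": errors are silently dropped

while True:
    threading.Thread(target=get_html, args=(q.get(),), daemon=True).start()
    time.sleep(0.5)

Either version takes a seed URL on the command line, e.g. python tdicrawler.py http://example.com, and dumps each fetched page into a timestamp-named .html file in the current directory.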