#!/usr/bin/env python
# coding:utf-8
# Referred: https://github.com/cheetz/brutescrape
#
"""
Copyright (c) 2016-2017 LandGrey (https://github.com/LandGrey/pydictor)
License: GNU GENERAL PUBLIC LICENSE Version 3
"""
from __future__ import unicode_literals
import os
import re
import ssl
from core.EXTEND import get_extend_dic
from lib.fun.osjudger import py_ver_egt_3
from lib.data.data import paths, pystrs, pyoptions
from lib.fun.fun import unique, cool, walk_pure_file, mybuildtime, finishcounter, finishprinter
ssl._create_default_https_context = ssl._create_unverified_context
# In Python 3, urllib and urllib2 were merged into a single urllib package:
# urllib2.urlopen() -> urllib.request.urlopen(), urllib2.Request() -> urllib.request.Request()
try:
    if py_ver_egt_3():
        from urllib.request import urlopen, Request
    else:
        from urllib2 import urlopen, Request
except ImportError as e:
    print(e)
    exit(cool.red('[-] cannot import urllib or urllib2 module:') + pyoptions.CRLF)
passcratch_white_list = walk_pure_file(paths.scraperwhitelist_path)

def stripHTMLTags(html):
    text = html
    rules = [
        {r'>\s+': '>'},                               # Remove spaces after a tag opens or closes.
        {r'\s+': ' '},                                # Collapse consecutive whitespace.
        {r'\s*<br\s*/?>\s*': '\n'},                   # Newline for every <br>.
        {r'</(div)\s*>\s*': '\n'},                    # Newline after </div>.
        {r'</(p|h\d)\s*>\s*': '\n\n'},                # Blank line after </p> and headings.
        {r'<head>.*<\s*(/head|body)[^>]*>': ''},      # Remove everything from <head> to </head>.
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # Keep link targets instead of link texts.
        {r'[ \t]*<[^<]*?/?>': ''},                    # Remove remaining tags.
        {r'^\s+': ''}                                 # Remove leading whitespace.
    ]
    for rule in rules:
        for (k, v) in rule.items():
            try:
                regex = re.compile(k)
                text = str(regex.sub(v, text))
            except re.error:
                pass
    # Unescape common HTML entities.
    htmlspecial = {
        '&nbsp;': ' ', '&amp;': '&', '&quot;': '"',
        '&lt;': '<', '&gt;': '>'
    }
for (k, v) in htmlspecial.items():
text = text.replace(k, v)
return text
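
# Example (illustrative, hypothetical input): stripHTMLTags('<p>hello&nbsp;<b>world</b></p>')
# yields plain text along the lines of 'hello world', with the tags stripped
# and common entities unescaped.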

def scratchword(siteList):
    results = []
    # Collect raw candidate words here before filtering.
    y_arr = []
    for site in siteList:
        try:
            site = site.strip()
            # Headers must be set on the Request before opening; assigning
            # them to the response object afterwards has no effect.
            request = Request(site, headers={
                'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'})
            response = urlopen(request)
            # The body must be decoded for Python 2 and Python 3 alike;
            # read it once, then try utf-8 and fall back to GBK.
            raw = response.read()
            try:
                x = stripHTMLTags(raw.decode('utf-8') + site)
            except UnicodeDecodeError:
                try:
                    x = stripHTMLTags(raw.decode('GBK') + site)
                except UnicodeDecodeError:
                    exit(cool.red("[-] Page encoding parse error, please use the 'extend' plugin instead") + pyoptions.CRLF)
            # Normalize the response text: any run of non-alphanumeric
            # characters (newlines, punctuation, slashes, ...) becomes one space.
            x = re.sub('[^A-Za-z0-9]+', ' ', x)
            x_arr = x.split(' ')
            for y in x_arr:
                y = y.strip()
                if y and len(y) >= 5:
                    # Strip leftover percent-encoding artifacts such as
                    # 2F (%2F), 23 (%23), 3F (%3F) and 3D (%3D).
                    if y[:2] in ('2F', '23', '3F', '3D'):
                        y = y[2:]
                    if 5 <= len(y) <= 25 and y.lower() not in passcratch_white_list:
                        y_arr.append(y)
        except Exception:
            exit(cool.red("[-] Process aborted, please check the url and try the 'extend' plugin instead") + pyoptions.CRLF)
    for yy in unique(y_arr):
        yy = yy.strip()
        # Drop pure numbers and anything matching the configured filter.
        if not yy.isdigit() and not re.findall(pyoptions.passcraper_filter, yy, flags=re.I):
            results.append(yy)
    return unique(results)
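
# Example (illustrative): scratchword(['http://example.com']) fetches the page
# and returns a deduplicated list of 5-25 character candidate words that are
# neither pure digits nor matched by the whitelist/filter rules above.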

def checkurl(urlike):
    try:
        if not str(urlike).startswith('http'):
            return 'http://' + urlike.strip()
        else:
            return urlike.strip()
    except Exception:
        exit(cool.red("[-] Incorrect url/uri: {0}".format(urlike)))
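
# Example (illustrative): checkurl('example.com') returns 'http://example.com',
# while checkurl('https://example.com') already has a scheme and is returned as-is.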

def scraper_magic(target=paths.scrapersites_path, only_scratch=False):
    sites = []
    if os.path.isfile(target):
        # Treat the target as a file of urls, one per line, skipping comment lines.
        with open(target, 'r') as f:
            for line in f:
                if not line.startswith(pyoptions.annotator):
                    sites.append(checkurl(line))
    else:
        sites.append(checkurl(target))
    rawlist = scratchword(sites)
    storepath = os.path.join(paths.results_path, "%s_%s%s" % (pystrs.SCFATCH_prefix, mybuildtime(),
                                                              pyoptions.filextension))
    with open(storepath, "a") as f:
        for line in rawlist:
            f.write(str(line) + pyoptions.CRLF)
    if only_scratch:
        finishprinter(finishcounter(storepath), storepath)
    else:
        get_extend_dic(rawlist, need_extendscratch=True)
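
# Illustrative usage sketch (assumes this script is run from the pydictor
# project root so that the core/ and lib/ packages imported above resolve):
if __name__ == '__main__':
    # Scrape a single page and write only the raw wordlist, skipping the
    # extend-dictionary post-processing step.
    scraper_magic(target='http://example.com', only_scratch=True)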