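# test.py: find backlink sources by scraping Google. Builds a search query
# scoped to owned_domain, walks the paginated results, then fetches each
# result page and records the URLs whose outgoing links point back at
# owned_domain.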
import os
import random
import re
import sys
import time
import urllib.parse
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# lxml must be installed: it is used below as a BeautifulSoup parser ("lxml").
# The SerpAPI import and key from the original are never used in this script;
# they are kept here commented out only for reference.
# from serpapi import GoogleSearch
# serpkey = "5bc5c4047d52b709733adf6a35b9d1d84c928e86bd7e9c4799444fa388b6bd7b"
query = 'how can i'
owned_domain = 'reddit.com'
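# Large platforms to drop from the results via -inurl: operators.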
exclude_urls = ['wikipedia',
                'youtube', 'facebook', 'instagram', 'pinterest', 'ebay',
                'tripadvisor', 'reddit', 'twitter', 'flickr', 'amazon', 'etsy',
                'dailymotion', 'linkedin', 'google', 'aliexpress', 'quora', owned_domain]
# Note: owned_domain is required via "allinurl:" but also excluded again via
# "-inurl:" below, so the two operators may cancel each other out.
query += " allinurl: " + owned_domain + ' '
for exclude in exclude_urls:
    query = query + " -inurl:" + exclude
query = urllib.parse.quote_plus(query)
number_result = 2000  # note: Google generally caps the &num= parameter at 100
# Random Chrome user agent; the fake-useragent endpoint may be offline, so fall
# back to a single static UA string if the request fails.
try:
    ua = requests.get("https://fake-useragent.herokuapp.com/browsers/0.1.11",
                      timeout=10).json()['browsers']['chrome']
except Exception:
    ua = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"]
google_url = ("https://www.google.com/search?q=" + query + "&num=" + str(number_result)
              + '&gbv=1&sei=YwHNVpHLOYiWmQHk3K24Cw')
print("Query:", google_url)
response = requests.get(google_url, headers={"User-Agent": random.choice(ua)})
soup = BeautifulSoup(response.content, "html5lib")
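# Each organic result on the basic-HTML (gbv=1) results page sits in a div
# with the class "egMi0 kCrYT".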
result_div = soup.find_all('div', attrs={'class': 'egMi0 kCrYT'})
if len(result_div) == 0:
    print('Nothing found..')
    sys.exit()
# Follow the "next page" link (a#pnnext) until Google runs out of result pages.
page = 1
while True:
    souplx = BeautifulSoup(response.text, "lxml")
    next_page = souplx.find('a', attrs={'id': 'pnnext'})
    if next_page is not None:
        print("Parsing page:", page, end='\r')
        response = requests.get("https://google.com" + next_page['href'],
                                headers={"User-Agent": random.choice(ua)})
        soup = BeautifulSoup(response.content, "html5lib")
        result_div += soup.find_all('div', attrs={'class': 'egMi0 kCrYT'})
        page += 1
    else:
        break
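# Collect the href and title from every scraped result block.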
links = []
titles = []
for r in result_div:
    try:
        link = r.find('a', href=True)
        title = r.find('h3', attrs={'class': 'zBAuLc l97dzf'}).get_text()
        if link is not None and title != '':
            links.append(link['href'])
            titles.append(title)
    except AttributeError:
        print('error while splitting a result block')
        continue
print(f'Found links: {len(links)}')
# Google wraps every result as /url?q=<target>&sa=...; pull out the target URL.
clean_links = []
for l in links:
    clean = re.search(r'/url\?q=(.*)&sa', l)
    if clean is None:
        continue
    clean_links.append(clean.group(1))
print(f'Clean links: {len(clean_links)}')
backlinks = []
error_manual = []
x = 0

def print_stats():
    print('({}/{}) [F: {} | E: {}] Searching backlinks.. [{}%]'.format(
        x, len(clean_links), len(backlinks), len(error_manual),
        round(x / len(clean_links) * 100, 2)), end='\r', flush=True)

# Fetch each candidate page and keep it when at least one of its anchors
# points at owned_domain.
for url in clean_links:
    print_stats()
    try:
        page_soup = BeautifulSoup(
            requests.get(url, timeout=4, allow_redirects=False).content, 'html5lib')
        hrefs = [a['href'] for a in page_soup.find_all('a', href=True)]
        if any(owned_domain in h for h in hrefs) and url not in backlinks:
            backlinks.append(url)
    except Exception:
        error_manual.append(url)
    finally:
        x += 1
        time.sleep(1.5)
if len(backlinks) > 0:
    # Write the referring URLs to a timestamped file under ./list/.
    os.makedirs('./list', exist_ok=True)
    with open(f'./list/{round(datetime.now().timestamp())}.txt', 'w') as backlink_file:
        for url in tqdm(backlinks, ncols=100):
            backlink_file.write(url + '\n')
    print(f'Found {len(backlinks)} backlinks..')