-
Notifications
You must be signed in to change notification settings - Fork 0
/
g_search_include_pdf.py
120 lines (96 loc) · 2.93 KB
/
g_search_include_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#coding = utf-8
try:
from googlesearch import search
except ImportError:
print("No module named 'google' found")
from os.path import exists
from tkinter import E
import urllib
import urllib.request as urllib2
import chardet #<- import this lib
import os
# File holding the search queries, one per line; get_key() reads it as UTF-8.
Key_file_name = "key.txt"
# Optional file whose first line is the number of pages to save per query;
# get_key() falls back to 30 when the file is missing or the value is invalid.
conf_file_name = "conf.txt"
def is_pdf(url, timeout=5):
    """Return True if the resource at *url* starts with the PDF signature.

    Only the first four bytes of the response are read and compared with
    ``%PDF``; the rest of the body is never downloaded.

    Args:
        url: Address (or ``Request`` object) passed to ``urlopen``.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        True when the content begins with ``b'%PDF'``. False otherwise,
        including when the URL cannot be opened or read (the error is
        printed, not raised).
    """
    try:
        with urllib2.urlopen(url, timeout=timeout) as response:
            first_bytes = response.read(4)
    except Exception as e:
        # Best-effort probe: any failure simply means "not a usable PDF".
        print(f"Error: {e}")
        return False
    return first_bytes.startswith(b'%PDF')
# Save one search result (PDF or HTML page) to disk
def savetohtml(root, index, url):
    """Download *url* and store it under *root* as ``<index>.pdf`` or ``<index>.html``.

    A URL ending in "pdf" whose content passes ``is_pdf`` is saved as raw
    bytes. Anything else is fetched as a page, its encoding is guessed with
    ``chardet``, and the decoded text is written back out in that encoding.

    Args:
        root: Directory that receives the file (it must already exist).
        index: Value used as the file's base name.
        url: Address of the resource to download.

    Returns:
        True when a file was written; False when the download or the
        decoding failed (a short message is printed in that case).
    """
    import shutil

    req = urllib2.Request(url)
    req.add_header("User-agent", "Mozilla/5.0 (Windows NT 10.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.2228.0 Safari/537.36")

    if url.lower().endswith("pdf") and is_pdf(url):
        print("PDF>>")
        new_file = "{}//{}.pdf".format(root, index)
        try:
            # Go through `req` so the browser-like User-agent and the timeout
            # apply to PDF downloads too (urlretrieve supports neither).
            with urllib2.urlopen(req, timeout=5) as response, \
                    open(new_file, "wb") as out:
                shutil.copyfileobj(response, out)
        except Exception:
            print("urlopen failed!")
            return False
        return True

    try:
        # The context manager closes the connection even if read() raises.
        with urllib2.urlopen(req, timeout=5) as response:
            html = response.read()
    except Exception:
        # Best-effort: a failed fetch skips this result. Unlike a bare
        # `except:`, Ctrl-C and SystemExit still propagate.
        print("urlopen failed!")
        return False

    charset1 = chardet.detect(html)
    print(charset1)
    encoding1 = charset1["encoding"]
    if encoding1 is None:
        # chardet could not guess an encoding; fall back to UTF-8.
        encoding1 = "UTF-8"

    try:
        unicode_text = html.decode(encoding1)
    except (UnicodeError, LookupError):
        # Wrong guess (undecodable bytes) or a codec Python does not know.
        print("decode failed!")
        return False

    new_file = "{}//{}.html".format(root, index)
    with open(new_file, 'w', encoding=encoding1) as f:
        f.write(unicode_text)
    return True
def gsearch(root, key, count=50):
    """Search for *key* and save up to *count* result pages under *root*.

    The search is asked for ``count + 30`` results, which leaves spare URLs
    for the ones ``savetohtml`` fails to store. Saved files are numbered
    consecutively from 1, and a failed download does not consume a number.

    Args:
        root: Directory handed to ``savetohtml`` for every result.
        key: Search query string.
        count: Number of pages to save before stopping.

    Returns:
        True once *count* pages have been saved (trivially so when *count*
        is not positive); False when the results ran out first.
    """
    if count <= 0:
        # Nothing requested: do not hit the network for a stray first page.
        return True

    index = 1
    for j in search(key, tld="co.kr", num=count + 30, stop=count + 30, pause=2):
        if savetohtml(root, index, j):
            index += 1
            if index > count:
                return True
    # Results exhausted before reaching the requested number of pages.
    return False
def get_key():
    """Run one search-and-download job per query listed in the key file.

    Reads the per-query page count from the first line of ``conf_file_name``
    (30 when that file is missing, unparsable or not positive). For each
    non-blank line of ``Key_file_name`` it then creates a directory named
    "1", "2", ... in the current working directory and calls ``gsearch`` to
    fill it.

    Raises:
        SystemExit: when the key file does not exist.
    """
    default_count = 30

    if not exists(Key_file_name):
        print("don't exists key file")
        # Same effect as the interactive-only exit() helper, without
        # depending on the `site` module having installed it.
        raise SystemExit

    nCount = default_count
    if not exists(conf_file_name):
        print("don't exists conf file")
    else:
        with open(conf_file_name, 'r') as conf_file:
            try:
                nCount = int(conf_file.readline())
            except ValueError:
                # Empty, non-numeric or undecodable first line.
                nCount = default_count
        if nCount <= 0:
            nCount = default_count

    with open(Key_file_name, 'r', encoding="UTF-8") as f:
        lists = f.readlines()

    key_index = 1
    for raw_key in lists:
        # readlines() keeps the trailing newline, so strip before testing for
        # a blank line; this also keeps "\n" out of the search query.
        key = raw_key.strip()
        if not key:
            continue
        print("*" * 10, key)
        os.makedirs(str(key_index), exist_ok=True)
        gsearch(key_index, key, nCount)
        key_index += 1
# Script entry point: runs the whole search-and-download job. There is no
# __main__ guard, so this also runs whenever the module is imported.
get_key()
# Leftover manual check; note that savetohtml() takes (root, index, url).
#savetohtml("https://naver.com")