# start01.py (forked from LeLe86/vWeChatCrawl)
import os, sys
import requests
import json
import subprocess
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
"""
本项目开源地址 https://github.com/LeLe86/vWeChatCrawl
讨论QQ群 703431832 加群暗号:不止技术流
"""
# Save text content to a file
def SaveFile(fpath, fileContent):
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)
# Read a file and return its text content
def ReadFile(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        all_the_text = f.read()
    return all_the_text
# Convert a Unix timestamp to a date-time string in Beijing time (UTC+8)
def Timestamp2Datetime(stampstr):
    dt = datetime.utcfromtimestamp(stampstr)
    dt = dt + timedelta(hours=8)  # shift from UTC to UTC+8
    newtimestr = dt.strftime("%Y%m%d_%H%M%S")
    return newtimestr
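# For illustration (the input value is chosen here, not taken from the project):
# Timestamp2Datetime(1700000000) returns "20231115_061320",
# i.e. 2023-11-14 22:13:20 UTC shifted to Beijing time (UTC+8).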
# Load settings from config.json
def GetJson():
    jstxt = ReadFile("config.json")
    jstxt = jstxt.replace("\\\\", "/").replace("\\", "/")  # replace backslashes so Windows paths do not break JSON parsing
    jsbd = json.loads(jstxt)
    if jsbd["htmlDir"][-1] == "/":
        jsbd["htmlDir"] = jsbd["htmlDir"][:-1]  # strip trailing slash
    if jsbd["jsonDir"][-1] == "/":
        jsbd["jsonDir"] = jsbd["jsonDir"][:-1]
    return jsbd
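# For reference, a minimal config.json could look like the sketch below; the
# directory values are placeholders, not paths from the original project:
#
#   {
#       "jsonDir": "D:/wechat/json",
#       "htmlDir": "D:/wechat/html",
#       "pdfDir": "D:/wechat/pdf"
#   }
#
# The keys match what GetJson() and the __main__ block read; per the usage
# notes at the bottom of this file, the paths must not contain spaces.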
# Download the html of a url
def DownLoadHtml(url):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    session = requests.Session()
    session.trust_env = False  # ignore system proxy settings
    response = session.get(url, headers=headers)
    if response.status_code == 200:
        htmltxt = response.text  # body of the returned page
        return htmltxt
    else:
        return None
# Download an image from a remote url and save it locally
def DownImg(url, savepath):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    session = requests.Session()
    session.trust_env = False
    response = session.get(url, headers=headers)
    with open(savepath, 'wb') as f:
        f.write(response.content)
# Rewrite the src of every image in the page so the images display from local copies
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument is always "lxml"
    imgList = bs.findAll("img")
    imgindex = 0
    for img in imgList:
        imgindex += 1
        originalURL = ""  # the real image url
        if "data-src" in img.attrs:  # some <img> tags have no data-src
            originalURL = img.attrs['data-src']
        elif "src" in img.attrs:  # fall back to src if present
            originalURL = img.attrs['src']
        else:
            originalURL = ""
        if originalURL.startswith("//"):  # protocol-relative urls need an "http:" prefix
            originalURL = "http:" + originalURL
        if len(originalURL) > 20:
            print("\r down imgs " + "▇" * imgindex + " " + str(imgindex), end="")
            if "data-type" in img.attrs:
                imgtype = img.attrs["data-type"]
            else:
                imgtype = "png"
            imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # image file name such as xxx_1.png
            imgsavepath = os.path.join(saveimgdir, imgname)  # local path for the image
            DownImg(originalURL, imgsavepath)
            img.attrs["src"] = "images/" + imgname  # relative path used inside the page
        else:
            img.attrs["src"] = ""
    ChangeCssSrc(bs)  # fix the <link> tags
    ChangeContent(bs)  # clear the style on js_content so the article body displays
    allscript = bs.findAll("script")
    for script in allscript:
        if "src" in script.attrs:  # drop remote js whose failed loading makes the page slow to open
            script["src"] = ""
    return str(bs)  # convert the BeautifulSoup object back to a string for saving
# Give protocol-relative stylesheet links an explicit http: scheme
def ChangeCssSrc(bs):
    linkList = bs.findAll("link")
    for link in linkList:
        href = link.attrs.get("href", "")  # a <link> without href would otherwise raise KeyError
        if href.startswith("//"):
            newhref = "http:" + href
            link.attrs["href"] = newhref
# Clear the inline style of js_content so the article body is visible
def ChangeContent(bs):
    jscontent = bs.find(id="js_content")
    if jscontent:
        jscontent.attrs["style"] = ""
    else:
        print("----- the article may have been deleted -----")
# Article record
class Article():
    def __init__(self, url, pubdate, idx, title):
        self.url = url
        self.pubdate = pubdate
        self.idx = idx
        self.title = title
# Heavily hacked from the original, which read the article list from the
# packet-capture json files. The timestamp below is arbitrary, the article
# link has to be obtained separately and written into the code, and the title
# can be anything; the only goal is to download an article whose url is known.
def GetArticleList(jsondir):
    ArtList = []
    pubstamp = 1845588900
    pubdate = Timestamp2Datetime(pubstamp)
    url = "https://mp.weixin.qq.com/s/j12KabNDpGiWaePoBIs6kQ"  # article link
    idx = 1
    title = "mybest"
    art = Article(url, pubdate, idx, title)
    ArtList.append(art)
    return ArtList
# Download the html of every article in the list
def DownHtmlMain(jsonDir, saveHtmlDir):
    if not os.path.exists(saveHtmlDir):
        os.makedirs(saveHtmlDir)
    saveImgDir = saveHtmlDir + "/images"
    if not os.path.exists(saveImgDir):
        os.makedirs(saveImgDir)
    ArtList = GetArticleList(jsonDir)
    ArtList.sort(key=lambda x: x.pubdate, reverse=True)  # sort by date, newest first
    totalCount = len(ArtList)
    idx = 0
    for art in ArtList:
        idx += 1
        artname = art.pubdate + "_" + str(art.idx)
        arthtmlname = artname + ".html"
        arthtmlsavepath = saveHtmlDir + "/" + arthtmlname
        print(idx, "of", totalCount, artname, art.title)
        # Skip articles that already exist, so an interrupted run can resume
        if os.path.exists(arthtmlsavepath):
            print("exists", arthtmlsavepath)
            continue
        arthtmlstr = DownLoadHtml(art.url)
        if arthtmlstr is None:  # download failed
            print("download failed", art.url)
            continue
        arthtmlstr = ChangeImgSrc(arthtmlstr, saveImgDir, artname)
        print("\r", end="")
        SaveFile(arthtmlsavepath, arthtmlstr)
        sleep(3)  # wait 3 seconds between articles to avoid being blocked by WeChat
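# A minimal sketch of driving the helpers above directly for a single known
# article url; the url and output paths are placeholders, not values from the
# original project:
#
#   html = DownLoadHtml("https://mp.weixin.qq.com/s/XXXXXXXX")
#   if html is not None:
#       os.makedirs("out/images", exist_ok=True)
#       fixed = ChangeImgSrc(html, "out/images", "demo")
#       SaveFile("out/demo.html", fixed)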
# Convert every html file in a directory to pdf
def PDFDir(htmldir, pdfdir):
    if not os.path.exists(pdfdir):
        os.makedirs(pdfdir)
    flist = os.listdir(htmldir)
    for f in flist:
        if (not f[-5:] == ".html") or ("tmp" in f):  # skip non-html files and temporary "tmp" files
            continue
        htmlpath = htmldir + "/" + f
        tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file used for the pdf conversion
        htmlstr = ReadFile(htmlpath)
        bs = BeautifulSoup(htmlstr, "lxml")
        title = ""
        # The pdf file name includes the article title, but characters that are
        # not allowed in file names will make the conversion fail
        titleTag = bs.find(id="activity-name")
        if titleTag is not None:
            title = "_" + titleTag.get_text().replace(" ", "").replace(" ", "").replace("\n", "").replace("|", "").replace(":", "")
        ridx = htmlpath.rindex("/") + 1
        pdfname = htmlpath[ridx:-5] + title
        pdfpath = pdfdir + "/" + pdfname + ".pdf"
        # Strip js and similar tags to reduce what has to be loaded during pdf
        # conversion; note that css (link) tags are removed here too, so if the
        # pdf layout looks wrong, try keeping the css
        [s.extract() for s in bs(["script", "iframe", "link"])]
        SaveFile(tmppath, str(bs))
        try:
            PDFOne(tmppath, pdfpath)
        except Exception:
            print("pdf conversion failed, possibly due to special characters in the title", f)
# Convert a single html file to pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if skipExists and os.path.exists(pdfpath):
        print("pdf exists", pdfpath)
        if removehtml:
            os.remove(htmlpath)
        return
    exepath = "wkhtmltopdf.exe"  # put wkhtmltopdf.exe in the same directory as this .py file
    cmdlist = []
    cmdlist.append(" --load-error-handling ignore ")
    cmdlist.append(" --page-height 200 ")  # the numbers can be adjusted, or these two lines omitted
    cmdlist.append(" --page-width 140 ")
    cmdlist.append(" " + htmlpath + " ")
    cmdlist.append(" " + pdfpath + " ")
    cmdstr = exepath + "".join(cmdlist)
    print(cmdstr)
    result = subprocess.check_call(cmdstr, shell=False)
    # stdout, stderr = result.communicate()
    # result.wait()  # wait for one conversion to finish before starting the next
    if removehtml:
        os.remove(htmlpath)
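# For reference, the command assembled above comes out roughly like this
# (the file names are placeholders):
#
#   wkhtmltopdf.exe --load-error-handling ignore --page-height 200 --page-width 140 demo_tmp.html demo.pdf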
"""
1.设置:
先去config.json文件中设置
jsonDir:Fiddler生成的文件
htmlDir:保存html的目录,路径中不能有空格
pdfDir:保存pdf的目录,路径中不能有空格
2.使用方法:
运行 python start.py #开始下载html
运行 python start.py pdf #把下载的html转pdf
"""
if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None
    else:
        arg = sys.argv[1]
    if arg is None or arg == "html":
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        jsdir = jsbd["jsonDir"]
        DownHtmlMain(jsdir, saveHtmlDir)
    elif arg == "pdf":
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)