forked from LeLe86/vWeChatCrawl
Showing 3 changed files with 247 additions and 0 deletions.
@@ -0,0 +1,2 @@
requests==2.22.0
beautifulsoup4==4.8.0
@@ -0,0 +1,245 @@
import os, sys
import requests
import json
import subprocess
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep

"""
This project is open-sourced at https://github.com/LeLe86/vWeChatCrawl
"""

# Save text content to a file
def SaveFile(fpath, fileContent):
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(fileContent)

# Read a file's entire text content
def ReadFile(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        all_the_text = f.read()
    return all_the_text

# Convert a unix timestamp to a date-time string in UTC+8 (Beijing time)
def Timestamp2Datetime(stampstr):
    dt = datetime.utcfromtimestamp(stampstr)
    dt = dt + timedelta(hours=8)
    newtimestr = dt.strftime("%Y%m%d_%H%M%S")
    return newtimestr
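
For example, 1577808000 seconds after the epoch is 2020-01-01 00:00:00 in UTC+8 (the value is chosen only for illustration):

print(Timestamp2Datetime(1577808000))  # -> 20200101_000000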

# Initialize the environment: load settings from config.json
def GetJson():
    jstxt = ReadFile("config.json")
    jsbd = json.loads(jstxt)
    return jsbd

# Download the web page at url
def DownLoadHtml(url):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        htmltxt = response.text  # body of the returned page
        return htmltxt
    else:
        return None
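
DownLoadHtml returns None for any non-200 response, and the caller has to handle that. A minimal retry wrapper, not part of the original script, could smooth over transient failures; it reuses the 3-second pacing the main loop already applies:

def DownLoadHtmlWithRetry(url, retries=3):
    # Hypothetical helper: try a few times before giving up
    for _ in range(retries):
        htmltxt = DownLoadHtml(url)
        if htmltxt is not None:
            return htmltxt
        sleep(3)  # same pacing the download loop uses
    return None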

# Download an image from a remote url and save it locally
def DownImg(url, savepath):
    r = requests.get(url)
    with open(savepath, 'wb') as f:
        f.write(r.content)

# Rewrite the src of every image in the page so the images display from local copies
def ChangeImgSrc(htmltxt, saveimgdir, htmlname):
    bs = BeautifulSoup(htmltxt, "lxml")  # build a BeautifulSoup object from the page source; the second argument selects the lxml parser
    imgList = bs.findAll("img")
    imgindex = 0
    for img in imgList:
        imgindex += 1
        originalURL = ""  # the real url of the image
        if "data-src" in img.attrs:  # some <img> tags may carry no data-src
            originalURL = img.attrs['data-src']
        elif "src" in img.attrs:  # fall back to src when present
            originalURL = img.attrs['src']
        else:
            originalURL = ""
        if originalURL.startswith("//"):  # protocol-relative urls need an http: prefix
            originalURL = "http:" + originalURL
        if len(originalURL) > 0:
            print("down img", imgindex)
            if "data-type" in img.attrs:
                imgtype = img.attrs["data-type"]
            else:
                imgtype = "png"
            imgname = htmlname + "_" + str(imgindex) + "." + imgtype  # image name such as 1.png
            imgsavepath = os.path.join(saveimgdir, imgname)  # where the image is saved
            DownImg(originalURL, imgsavepath)
            img.attrs["src"] = "images/" + imgname  # relative path used inside the saved page
        else:
            img.attrs["src"] = ""
    ChangeCssSrc(bs)  # fix the <link> tags as well
    return str(bs)  # convert the BeautifulSoup object back to a string for saving

# Fix protocol-relative urls in <link> tags
def ChangeCssSrc(bs):
    linkList = bs.findAll("link")
    for link in linkList:
        href = link.attrs.get("href", "")  # a <link> without href would otherwise raise KeyError
        if href.startswith("//"):
            newhref = "http:" + href
            link.attrs["href"] = newhref
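
A quick illustration of the fix-up on hypothetical markup:

bs = BeautifulSoup('<link href="//res.wx.qq.com/style.css">', "lxml")
ChangeCssSrc(bs)
print(bs.link.attrs["href"])  # -> http://res.wx.qq.com/style.css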

# A single article
class Article():
    def __init__(self, url, pubdate, idx, title):
        self.url = url
        self.pubdate = pubdate
        self.idx = idx
        self.title = title

# Extract article urls and metadata from the json files saved by Fiddler
def GetArticleList(jsondir):
    filelist = os.listdir(jsondir)
    ArtList = []
    for file in filelist:
        filepath = os.path.join(jsondir, file)
        filetxt = ReadFile(filepath)
        jsbody = json.loads(filetxt)
        general_msg_list = jsbody["general_msg_list"]
        jsbd2 = json.loads(general_msg_list)  # general_msg_list is itself a json string, so parse it again
        msglist = jsbd2["list"]
        for item in msglist:  # one item may contain several articles
            artidx = 1  # note: this index only makes saving html convenient; it does not correspond to the real publishing slot (headline, second article, third article, ...)
            comm_msg_info = item["comm_msg_info"]
            app_msg_ext_info = item["app_msg_ext_info"]
            pubstamp = comm_msg_info["datetime"]
            pubdate = Timestamp2Datetime(pubstamp)
            if comm_msg_info["type"] == 49:  # type 49 is a regular image-and-text article; other types exist but are ignored for now
                url = app_msg_ext_info["content_url"]  # article link
                idx = artidx
                title = app_msg_ext_info["title"]
                art = Article(url, pubdate, idx, title)
                ArtList.append(art)
                print(len(ArtList), pubdate, idx, title)
                if app_msg_ext_info["is_multi"] == 1:  # several articles published in one push
                    multi_app_msg_item_list = app_msg_ext_info["multi_app_msg_item_list"]
                    for subArt in multi_app_msg_item_list:
                        artidx += 1  # increment per sub-article so saved filenames do not collide
                        url = subArt["content_url"]
                        idx = artidx
                        title = subArt["title"]
                        art = Article(url, pubdate, idx, title)
                        ArtList.append(art)
                        print(len(ArtList), pubdate, idx, title)
    return ArtList
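
For reference, the shape of one captured file that GetArticleList walks, as implied by the parsing code (field values are placeholders; general_msg_list is a doubly encoded json string, hence the second json.loads):

# {
#   "general_msg_list": "{\"list\": [
#     {\"comm_msg_info\":    {\"datetime\": 1577808000, \"type\": 49},
#      \"app_msg_ext_info\": {\"title\": \"...\", \"content_url\": \"...\",
#                             \"is_multi\": 1, \"multi_app_msg_item_list\": [...]}}
#   ]}"
# }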

# Download all articles as html
def DownHtmlMain(jsonDir, saveHtmlDir):
    if not os.path.exists(saveHtmlDir):
        os.makedirs(saveHtmlDir)
    saveImgDir = os.path.join(saveHtmlDir, "images")
    if not os.path.exists(saveImgDir):
        os.makedirs(saveImgDir)
    ArtList = GetArticleList(jsonDir)
    ArtList.sort(key=lambda x: x.pubdate, reverse=True)  # sort by date, newest first
    totalCount = len(ArtList)
    idx = 0
    for art in ArtList:
        idx += 1
        artname = art.pubdate + "_" + str(art.idx)
        arthtmlname = artname + ".html"
        arthtmlsavepath = os.path.join(saveHtmlDir, arthtmlname)
        print(idx, "of", totalCount, artname, art.title)
        # Skip articles that already exist, so an interrupted run can resume where it left off
        if os.path.exists(arthtmlsavepath):
            print("exists", arthtmlsavepath)
            continue
        arthtmlstr = DownLoadHtml(art.url)
        if arthtmlstr is None:  # the download failed; skip this article
            print("download failed", art.url)
            continue
        arthtmlstr = ChangeImgSrc(arthtmlstr, saveImgDir, artname)
        SaveFile(arthtmlsavepath, arthtmlstr)
        sleep(3)  # pause 3 seconds between articles so WeChat does not block the downloads

# Convert every html file in a folder to pdf
def PDFDir(htmldir, pdfdir):
    if not os.path.exists(pdfdir):
        os.makedirs(pdfdir)
    flist = os.listdir(htmldir)
    for f in flist:
        if (not f[-5:] == ".html") or ("tmp" in f):  # skip files that are not html and the temporary files
            continue
        htmlpath = os.path.join(htmldir, f)
        tmppath = htmlpath[:-5] + "_tmp.html"  # temporary file fed to the pdf converter
        htmlstr = ReadFile(htmlpath)
        bs = BeautifulSoup(htmlstr, "lxml")
        title = ""
        # The pdf filename includes the article title, but the conversion fails if the title contains characters that are not allowed in filenames
        titleTag = bs.find(id="activity-name")
        if titleTag is not None:
            title = "_" + titleTag.get_text().replace(" ", "").replace("\u00a0", "").replace("\n", "")  # the second replace presumably targeted non-breaking spaces in the original
        htmlname = f[:-5] + title  # take the basename from f itself, which also works with Windows path separators
        pdfpath = os.path.join(pdfdir, htmlname + ".pdf")
        """
        Strip scripts and the like to reduce what has to load during pdf conversion.
        Note that css (link) is removed here too; if the pdf layout comes out
        broken, try keeping the css.
        """
        [s.extract() for s in bs(["script", "iframe", "link"])]
        SaveFile(tmppath, str(bs))
        PDFOne(tmppath, pdfpath)

# Convert a single html file to pdf
def PDFOne(htmlpath, pdfpath, skipExists=True, removehtml=True):
    if skipExists and os.path.exists(pdfpath):
        print("pdf exists", pdfpath)
        if removehtml:
            os.remove(htmlpath)
        return
    exepath = "wkhtmltopdf.exe"  # put wkhtmltopdf.exe in the same directory as this .py file
    cmdlist = [exepath,
               "--load-error-handling", "ignore",
               "--page-height", "200",  # adjust these numbers to taste, or drop the two page options entirely
               "--page-width", "140",
               htmlpath,
               pdfpath]
    print(" ".join(cmdlist))
    subprocess.check_call(cmdlist)  # blocks until one conversion finishes before the next starts
    if removehtml:
        os.remove(htmlpath)
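
The assembled command is equivalent to running the following (the two filenames are placeholders):

# wkhtmltopdf.exe --load-error-handling ignore --page-height 200 --page-width 140 article_tmp.html article.pdf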

"""
First edit config.json and set:
jsonDir: the folder of files generated by Fiddler
htmlDir: the folder to save html into; the path must not contain spaces
pdfDir: the folder to save pdf into; the path must not contain spaces
"""

if __name__ == "__main__":
    if len(sys.argv) == 1:
        arg = None
    else:
        arg = sys.argv[1]
    if arg is None or arg == "html":  # no argument behaves the same as "html"
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        jsdir = jsbd["jsonDir"]
        DownHtmlMain(jsdir, saveHtmlDir)
    elif arg == "pdf":
        jsbd = GetJson()
        saveHtmlDir = jsbd["htmlDir"]
        savePdfDir = jsbd["pdfDir"]
        PDFDir(saveHtmlDir, savePdfDir)
Binary file not shown.
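
Typical usage, assuming the script is saved as start.py (the actual filename is not shown in this diff):

python start.py html   # step 1: download the captured articles as html
python start.py pdf    # step 2: convert the saved html files to pdf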