# coding=utf-8
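"""
Crawl a tutorial site chapter by chapter and bundle the pages into a single
PDF. A concrete crawler subclasses Crawler and implements parse_menu() and
parse_body(); the PDF itself is produced by pdfkit, which drives the
wkhtmltopdf command-line tool.
"""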
import logging
import os
import re
import time

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

import pdfkit
import requests
from bs4 import BeautifulSoup

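# Minimal HTML shell; each scraped chapter body is wrapped in this template
# before being written to disk for PDF conversion.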
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""

class Crawler(object):
    """
    Base crawler class; every concrete crawler should inherit from it.
    """
    name = None

    def __init__(self, name, start_url):
        """
        Initialize the crawler.
        :param name: filename of the PDF to save, without the extension
        :param start_url: entry URL for the crawler
        """
        self.name = name
        self.start_url = start_url
        # Scheme + host of the entry URL, used later to absolutize relative links.
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    def crawl(self, url):
        """
        Fetch a single URL.
        :param url: URL to fetch
        :return: the requests response object
        """
        print(url)
        response = requests.get(url)
        return response

    def parse_menu(self, response):
        """
        Parse the table of contents and return every chapter URL; implemented
        by subclasses.
        :param response: response object returned by crawl()
        :return: iterable of URLs (a list, generator, or tuple all work)
        """
        raise NotImplementedError

    def parse_body(self, response):
        """
        Parse the page body; implemented by subclasses.
        :param response: response object returned by crawl()
        :return: the processed HTML text
        """
        raise NotImplementedError

    def run(self):
        start = time.time()
        # Options handed straight to wkhtmltopdf via pdfkit; the cookie
        # entries below are placeholders.
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
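        # Render each chapter to a numbered intermediate HTML file, then let
        # wkhtmltopdf merge them all into one PDF.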
        htmls = []
        for index, url in enumerate(self.parse_menu(self.crawl(self.start_url))):
            html = self.parse_body(self.crawl(url))
            f_name = ".".join([str(index), "html"])
            with open(f_name, 'wb') as f:
                f.write(html)
            htmls.append(f_name)
        pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        for html in htmls:
            os.remove(html)
        total_time = time.time() - start
        print(u"Total time: %f seconds" % total_time)
class LiaoxuefengPythonCrawler(Crawler):
    """
    Liao Xuefeng's Python 3 tutorial.
    """

    def parse_menu(self, response):
        """
        Parse the table of contents and yield every chapter URL.
        :param response: response object returned by crawl()
        :return: generator of URLs
        """
        soup = BeautifulSoup(response.content, "html.parser")
        # The second "uk-nav uk-nav-side" list on the page is the chapter menu.
        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # make the URL absolute
            yield url

    def parse_body(self, response):
        """
        Parse the page body.
        :param response: response object returned by crawl()
        :return: the processed HTML text, encoded as UTF-8 bytes
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]
            # Prepend the page title, centered.
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
            title_tag = soup.new_tag('h1')
            title_tag.string = title
            center_tag.insert(1, title_tag)
            body.insert(1, center_tag)
            html = str(body)
            # Rewrite relative src attributes of <img> tags to absolute URLs.
            pattern = r'(<img .*?src=")(.*?)(")'

            def func(m):
                if not m.group(2).startswith("http"):
                    return "".join([m.group(1), self.domain, m.group(2), m.group(3)])
                else:
                    return "".join([m.group(1), m.group(2), m.group(3)])

            html = re.compile(pattern).sub(func, html)
            html = html_template.format(content=html)
            html = html.encode("utf-8")
            return html
        except Exception:
            logging.error("Failed to parse page body", exc_info=True)
            raise
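

# A minimal sketch of another subclass (site URL and CSS selectors are
# hypothetical), showing the two methods a concrete crawler must provide:
#
#     class MyDocsCrawler(Crawler):
#         def parse_menu(self, response):
#             soup = BeautifulSoup(response.content, "html.parser")
#             for a in soup.select("nav a"):  # assumed menu selector
#                 yield a["href"]
#
#         def parse_body(self, response):
#             soup = BeautifulSoup(response.content, "html.parser")
#             content = str(soup.find("article"))  # assumed content tag
#             return html_template.format(content=content).encode("utf-8")
#
# Note: pdfkit is a thin wrapper around the wkhtmltopdf command-line tool,
# so wkhtmltopdf must be installed and on PATH for run() to work.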

if __name__ == '__main__':
    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
    crawler.run()