Skip to content

Commit

Permalink
refactor htmltopdf
Browse files Browse the repository at this point in the history
  • Loading branch information
lzjun567 committed Feb 20, 2017
1 parent ac103b2 commit 18c0683
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 141 deletions.
4 changes: 4 additions & 0 deletions pdf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ python crawler.py

3. 生成的PDF文件可以在公众号回复『pdf』下载

### 更新记录

* 2017-2-21: 对代码进行了全面的重构,可扩展, 子类爬虫只需实现 `parse_menu` 和 `parse_body` 方法就可以实现HTML转换PDF的逻辑


### Contact me

Expand Down
163 changes: 163 additions & 0 deletions pdf/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# coding=utf-8
import logging
import os
import re
import time

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    # Narrowed from a bare `except:`: only the import failure should be
    # caught here, not KeyboardInterrupt/SystemExit or unrelated errors.
    from urlparse import urlparse  # Python 2

import pdfkit
import requests
from bs4 import BeautifulSoup

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""


class Crawler(object):
    """
    Base class for all crawlers.

    Subclasses implement parse_menu() and parse_body(); run() then drives
    the whole HTML-to-PDF pipeline.
    """

    def __init__(self, name, start_url):
        """
        :param name: output PDF file name, without the ".pdf" suffix
        :param start_url: entry URL of the crawl
        """
        self.name = name
        self.start_url = start_url
        # Scheme + host of the entry URL, used by subclasses to turn
        # relative links into absolute ones.
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    def crawl(self, url):
        """
        Fetch a single page.

        :param url: page URL to fetch
        :return: the requests.Response for the page
        """
        print(url)
        response = requests.get(url)
        return response

    def parse_menu(self, response):
        """
        Parse the table of contents and produce all chapter URLs;
        implemented by subclasses.

        :param response: response object returned by crawl()
        :return: iterable of URLs (list, generator, tuple, ...)
        """
        raise NotImplementedError

    def parse_body(self, response):
        """
        Parse one chapter page into printable HTML; implemented by subclasses.

        :param response: response object returned by crawl()
        :return: processed HTML (bytes) for that chapter
        """
        raise NotImplementedError

    def run(self):
        """
        Crawl every chapter, write each to an intermediate N.html file,
        convert them all into one PDF, then remove the intermediate files.
        """
        start = time.time()
        # wkhtmltopdf options passed through pdfkit.
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
        htmls = []
        try:
            for index, url in enumerate(self.parse_menu(self.crawl(self.start_url))):
                html = self.parse_body(self.crawl(url))
                f_name = ".".join([str(index), "html"])
                with open(f_name, 'wb') as f:
                    f.write(html)
                htmls.append(f_name)

            pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        finally:
            # Always remove the intermediate HTML files, even when
            # pdfkit/wkhtmltopdf fails (the originals leaked them on error).
            for html in htmls:
                os.remove(html)
        total_time = time.time() - start
        print(u"总共耗时:%f 秒" % total_time)


class LiaoxuefengPythonCrawler(Crawler):
    """
    Crawler for Liao Xuefeng's Python 3 tutorial.
    """

    def parse_menu(self, response):
        """
        Parse the table of contents and yield every chapter URL.

        :param response: response object returned by crawl()
        :return: generator of absolute chapter URLs
        """
        soup = BeautifulSoup(response.content, "html.parser")
        # The second "uk-nav uk-nav-side" list on the page is the chapter
        # menu (the first is a different sidebar).
        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # make the link absolute
            yield url

    def parse_body(self, response):
        """
        Extract the article body, prepend its title, and wrap it into a
        standalone HTML page.

        :param response: response object returned by crawl()
        :return: UTF-8 encoded HTML bytes for the chapter
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]

            # Insert the chapter title at the top, centered as an <h1>.
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
            title_tag = soup.new_tag('h1')
            title_tag.string = title
            center_tag.insert(1, title_tag)
            body.insert(1, center_tag)

            html = str(body)
            # Rewrite relative <img src="..."> paths to absolute URLs so
            # wkhtmltopdf can fetch the images.
            pattern = "(<img .*?src=\")(.*?)(\")"

            def func(m):
                # BUG FIX: the URL is group(2); the original tested
                # m.group(3), which is always the closing quote, so the
                # domain was wrongly prepended to absolute URLs as well.
                if not m.group(2).startswith("http"):
                    return "".join([m.group(1), self.domain, m.group(2), m.group(3)])
                return m.group(0)

            html = re.compile(pattern).sub(func, html)
            html = html_template.format(content=html)
            html = html.encode("utf-8")
            return html
        except Exception:
            # Log with traceback, then propagate: the original swallowed the
            # error and implicitly returned None, which made run() fail later
            # with an unrelated TypeError on f.write(None).
            logging.error("解析错误", exc_info=True)
            raise


if __name__ == '__main__':
    # Script entry point: crawl the tutorial starting at this wiki page
    # and produce a single PDF.
    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    # NOTE(review): the output name says "Git" while the crawler class is for
    # the Python 3 tutorial — confirm the intended PDF file name.
    crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
    crawler.run()
141 changes: 0 additions & 141 deletions pdf/crawler1.py

This file was deleted.

0 comments on commit 18c0683

Please sign in to comment.