Commit d3301b8

Merge branch 'jobbole'

lzjun567 committed Feb 15, 2017
2 parents 3bd05b6 + fc7bde2
Showing 17 changed files with 1,309 additions and 43 deletions.
49 changes: 6 additions & 43 deletions README.md
@@ -1,47 +1,10 @@
# Python Crawler: Convert Liao Xuefeng's Tutorials into a PDF E-book
# Table of Contents

### System Requirements
Python 3.4 or later; Python 2.x is not supported
* [Python Crawler: Convert Liao Xuefeng's Tutorials into a PDF E-book](./pdf/README.md)
* [Drawing a "Heart" with Python](./heart/README.md)


### Required Tools

requests and beautifulsoup are the two workhorses of web crawling: requests handles the HTTP requests and beautifulsoup parses the HTML. With these two in hand the work goes quickly; a full crawler framework such as scrapy would be overkill for a small script like this. Since we are converting HTML to PDF, we also need a library for that: wkhtmltopdf is an excellent cross-platform HTML-to-PDF converter, and pdfkit is its Python wrapper (a minimal end-to-end sketch follows the installation steps below). First install the dependencies:

```shell
pip install requests
pip install beautifulsoup4
pip install pdfkit
```

### Installing wkhtmltopdf
On Windows, download a stable build of wkhtmltopdf from [http://wkhtmltopdf.org/downloads.html](http://wkhtmltopdf.org/downloads.html) and install it, then add the program's executable directory to the system $PATH variable; otherwise pdfkit cannot find wkhtmltopdf and fails with "No wkhtmltopdf executable found". On Ubuntu and CentOS it can be installed directly from the command line:

```shell
$ sudo apt-get install wkhtmltopdf # ubuntu
$ sudo yum install wkhtmltopdf   # centos
```
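
With everything installed, the whole flow is short: fetch a page with requests, pick out the article body with BeautifulSoup, and hand the assembled HTML to pdfkit. The snippet below is only a rough sketch with assumed URLs, selectors, and file names, not the project's actual crawler.py:

```python
# Rough sketch only -- the URL, CSS class, and output file names are placeholders.
import pdfkit
import requests
from bs4 import BeautifulSoup

response = requests.get("http://www.liaoxuefeng.com/")          # fetch one tutorial page
soup = BeautifulSoup(response.content, "html.parser")
body = soup.find("div", class_="x-wiki-content")                # assumed content container
html = "<html><head><meta charset='utf-8'></head><body>%s</body></html>" % body

pdfkit.from_string(html, "page.pdf")                            # one page -> PDF
# pdfkit.from_url("http://www.liaoxuefeng.com/", "site.pdf")    # or convert a URL directly
```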

### Running
```shell
python crawler.py
```

### Result
![image](./crawer-pdf.png)

### FAQ

1. SyntaxError: Missing parentheses in call to 'print'

BeautifulSoup 3 does not support Python 3, so when installing BeautifulSoup make sure to specify beautifulsoup4.
2. If you develop with PyCharm, run the script from a shell/cmd window; running it directly inside PyCharm will not find the wkhtmltopdf command (see the sketch below for pointing pdfkit at an explicit wkhtmltopdf path).
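
If wkhtmltopdf is installed but not on PATH, pdfkit can also be pointed at the binary explicitly. This is only a hedged sketch; the install path below is a placeholder for wherever wkhtmltopdf lives on your machine:

```python
# Hypothetical workaround: tell pdfkit where the wkhtmltopdf binary is
# when it cannot be found on PATH. The path below is a placeholder.
import pdfkit

config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_url("http://example.com/", "out.pdf", configuration=config)
```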


### contact me
### Contact me

>Author: liuzhijun
>WeChat ID: lzjun567
>Official account: 一个程序员的微站 (VTtalk)

>WeChat: lzjun567
>Official account: 一个程序员的微站 (id: VTtalk)
Empty file added blog/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions blog/crawler_blog.py
@@ -0,0 +1,23 @@
# encoding: utf-8
import requests

__author__ = 'liuzhijun'
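# Log in to python.jobbole.com through the WordPress admin-ajax endpoint,
# then reuse the returned session cookies to fetch a post as a logged-in user.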

if __name__ == '__main__':
cookies = {
"wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf": "liuzhijun%7C1489451730%7Ch1qqRwDqQsBrt3MdwKXXen1IMV1m31tHXITLutHszlT%7C7c5e634d83279f3cf8d37ec7db76a80d775198593d55a165cf579c9f17308c28"
}

data = {"action": "user_login",
"user_login": "liuzhijun",
"user_pass": "lzjun854977",
"remember_me": "1",}
    # redirect_url: http://www.jobbole.com
url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
response = requests.post(url, data)

for name, value in response.cookies.items():
print(name, value)

response = requests.get("http://python.jobbole.com/87305/", cookies=response.cookies)
print(response.content.decode('utf-8'))
170 changes: 170 additions & 0 deletions blog/crawler_blog_async.py
@@ -0,0 +1,170 @@
#!/usr/bin/env python
# encoding: utf-8

import time
from pymongo import MongoClient
import requests
from datetime import timedelta
import re
from bs4 import BeautifulSoup
from tornado import httpclient, gen, ioloop, queues

__author__ = 'liuzhijun'

concurrency = 10

headers = {
'Connection': 'Keep-Alive',
'Accept': 'text/html, application/xhtml+xml, */*',
'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Sa",
"Referer": "http://www.jobbole.com/",
}


@gen.coroutine
def get_posts_url_from_page(page_url):
"""
获取指定页面中所有文章的URL
:param page_url
:return:
"""
try:
response = yield httpclient.AsyncHTTPClient().fetch(page_url, headers=headers)
soup = BeautifulSoup(response.body, 'html.parser')
posts_tag = soup.find_all('div', class_="post floated-thumb")
urls = []
for index, archive in enumerate(posts_tag):
meta = archive.find("div", class_="post-meta")
url = meta.p.a['href']
urls.append(url)
raise gen.Return(urls)
except httpclient.HTTPError as e:
print('Exception: %s %s' % (e, page_url))
raise gen.Return([])


@gen.coroutine
def get_post_data_from_url(post_url, cookies):
"""
获取文章的元信息:阅读数\点赞数\收藏数\评论
:param post_url:
:return:
"""
try:
headers["Cookie"] = ";".join([name + "=" + value for name, value in cookies.items()])
response = yield httpclient.AsyncHTTPClient().fetch(post_url, headers=headers)
soup = BeautifulSoup(response.body, 'html.parser')
title = soup.find("div", class_="entry-header").get_text()
meta_tag = soup.find("div", class_="entry-meta").p
text = meta_tag.get_text()

def extract_keyword(pattern, content):
"""
利用正则表达式提取匹配的内容
"""
match = re.compile(pattern, flags=re.S).search(content)
if match:
                return int(match.group(1).replace(",", "").replace(" ", ""))  # strip thousands separators and stray spaces
else:
return 0

read_count = extract_keyword("([\d,]+) 阅读", text)
comment_count = extract_keyword("([\d,]+) 评论", text)

post_adds = soup.find("div", class_="post-adds")

vote_count = extract_keyword("([\d, ]+) 赞", post_adds.find("span", class_="vote-post-up").get_text())
bookmark_count = extract_keyword("([\d, ]+) 收藏", post_adds.find("span", class_="bookmark-btn").get_text())

post_data = {"url": post_url,
"title": title,
"read_count": read_count,
"comment_count": comment_count,
"vote_count": vote_count,
"bookmark_count": bookmark_count}
print(title)
raise gen.Return(post_data)
except httpclient.HTTPError as e:
print('Exception: %s %s' % (e, post_url))
raise gen.Return({})



@gen.coroutine
def mainx():
start = time.time()
fetched = 0
client = MongoClient('mongodb://localhost:27017/')
db = client['posts']
cookies = {
'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}
post_queue = queues.Queue()
page_queue = queues.Queue()
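    # Two-stage producer/consumer pipeline: page_queue feeds listing pages to the URL
    # workers, which push individual post URLs onto post_queue for the data workers;
    # `concurrency` workers of each kind are spawned further down.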
for i in range(1, 69):
page_url = "http://python.jobbole.com/all-posts/page/{page}/".format(page=i)
page_queue.put(page_url)
print(page_url)

@gen.coroutine
def posts_url_worker():
while True:
page = yield page_queue.get()
urls = yield get_posts_url_from_page(page)
for u in urls:
post_queue.put(u)
page_queue.task_done()

@gen.coroutine
def post_data_worker():
while True:
url = yield post_queue.get()
post = yield get_post_data_from_url(url, cookies)
nonlocal fetched
fetched += 1
db.posts.insert_one(post)
post_queue.task_done()

for _ in range(concurrency):
posts_url_worker()
for _ in range(concurrency):
post_data_worker()

yield page_queue.join()
yield post_queue.join()
# yield q.join(timeout=timedelta(seconds=300))
    print('Crawled %s posts in %d seconds total.' % (fetched, time.time() - start))


def login():
"""
登录账户,获取登录cookie信息
:return:
"""
url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
account = {"action": "user_login",
"user_login": "liuzhijun",
"user_pass": "**********",
"remember_me": "1"}
response = requests.post(url, data=account)
print(response.cookies)
cookies = dict((name, value) for name, value in response.cookies.items())
return cookies


if __name__ == '__main__':
# print(login())
#
# import logging
#
# logging.basicConfig()
io_loop = ioloop.IOLoop.current()
# io_loop.run_sync(main)
# io_loop.run_sync(lambda: get_all_post_url(67))
cookies = {
'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}

# io_loop.run_sync(lambda: get_post_data_from_url("http://python.jobbole.com/87288/", cookies))
io_loop.run_sync(mainx)
Binary file removed crawer-pdf.png
Binary file not shown.
26 changes: 26 additions & 0 deletions heart/README.md
@@ -0,0 +1,26 @@
### Preparation
The general idea: crawl my Weibo posts, clean and process the data, run it through Chinese word segmentation, and hand the result to a word-cloud tool, which together with a scientific computing library and a plotting library produces the final image. The packages involved are:

Requests for the network requests that crawl the Weibo data, jieba for Chinese word segmentation, wordcloud for the word cloud, Pillow for image processing, NumPy for scientific computing, and Matplotlib, a MATLAB-like 2D plotting library (a minimal sketch of the segmentation and word-cloud step follows below).
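
As a rough illustration of the segmentation and word-cloud steps only, assuming the Weibo texts have already been crawled (the sample texts, font path, and output file below are placeholders):

```python
# Minimal sketch: jieba keyword extraction feeding a WordCloud image.
# The sample texts, font path, and output file name are placeholder assumptions.
import jieba.analyse
from wordcloud import WordCloud

weibo_texts = ["今天天气不错,适合出门走走", "分享一篇 Python 爬虫的教程"]

tags = []
for text in weibo_texts:
    tags.extend(jieba.analyse.extract_tags(text, topK=20))

wc = WordCloud(font_path="msyh.ttc", background_color="white")
wc.generate(" ".join(tags))
wc.to_file("wordcloud.png")
```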

### Installing the Tools
These packages can fail to install in platform-specific ways. wordcloud, requests, and jieba can all be installed online with plain pip:
```shell
pip install wordcloud
pip install requests
pip install jieba
```
On Windows, installing Pillow, NumPy, and Matplotlib directly with pip tends to run into various problems. A recommended alternative is to download the corresponding .whl files from the third-party site Python Extension Packages for Windows [1] and install them locally. Choose the files that match your environment: cp27 corresponds to Python 2.7 and amd64 to a 64-bit system. After downloading, install them with pip:
```shell
pip install Pillow-4.0.0-cp27-cp27m-win_amd64.whl
pip install scipy-0.18.0-cp27-cp27m-win_amd64.whl
pip install numpy-1.11.3+mkl-cp27-cp27m-win_amd64.whl
pip install matplotlib-1.5.3-cp27-cp27m-win_amd64.whl
```
On other platforms, Google the error message to resolve it, or open an [issue](https://github.com/lzjun567/crawler_html2pdf/issues) on GitHub.

### Contact me

>Author: liuzhijun
>WeChat: lzjun567
>Official account: 一个程序员的微站 (id: VTtalk)
Empty file added heart/__init__.py
Empty file.
Binary file added heart/heart-mask.jpg
Binary file added heart/heart.jpg
87 changes: 87 additions & 0 deletions heart/heart.py
@@ -0,0 +1,87 @@
# -*- coding:utf-8 -*-
import codecs
import csv
import re

import jieba.analyse
import matplotlib.pyplot as plt
import requests
from scipy.misc import imread
from wordcloud import WordCloud

__author__ = 'liuzhijun'

cookies = {
"ALF": "xxxx",
"SCF": "xxxxxx.",
"SUBP": "xxxxx",
"SUB": "xxxx",
"SUHB": "xxx-", "xx": "xx", "_T_WM": "xxx",
"gsScrollPos": "", "H5_INDEX": "0_my", "H5_INDEX_TITLE": "xxx",
"M_WEIBOCN_PARAMS": "xxxx"
}


def fetch_weibo():
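    """Crawl the logged-in user's Weibo timeline page by page via the m.weibo.cn cards API and yield each cleaned post text."""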
api = "http://m.weibo.cn/index/my?format=cards&page=%s"
for i in range(1, 102):
response = requests.get(url=api % i, cookies=cookies)
data = response.json()[0]
groups = data.get("card_group") or []
for group in groups:
text = group.get("mblog").get("text")
text = text.encode("utf-8")

def cleanring(content):
"""
                Strip HTML tags, repost markers, and punctuation that carry no useful content.
"""
pattern = "<a .*?/a>|<i .*?/i>|转发微博|//:|Repost|,|?|。|、|分享图片"
content = re.sub(pattern, "", content)
return content

text = cleanring(text).strip()
if text:
yield text


def write_csv(texts):
with codecs.open('./weibo.csv', 'w') as f:
writer = csv.DictWriter(f, fieldnames=["text"])
writer.writeheader()
for text in texts:
writer.writerow({"text": text})


def read_csv():
with codecs.open('./weibo.csv', 'r') as f:
reader = csv.DictReader(f)
for row in reader:
yield row['text']


def word_segment(texts):
jieba.analyse.set_stop_words("./stopwords.txt")
for text in texts:
tags = jieba.analyse.extract_tags(text, topK=20)
yield " ".join(tags)


def generate_img(texts):
data = " ".join(text for text in texts)

mask_img = imread('./heart-mask.jpg', flatten=True)
wordcloud = WordCloud(
font_path='msyh.ttc',
background_color='white',
mask=mask_img
).generate(data)
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./heart.jpg', dpi=600)


if __name__ == '__main__':
texts = fetch_weibo()
write_csv(texts)
generate_img(word_segment(read_csv()))