Commit d3301b8

Merge branch 'jobbole'

lzjun567 committed Feb 15, 2017
2 parents 3bd05b6 + fc7bde2
Showing 17 changed files with 1,309 additions and 43 deletions.
49 changes: 6 additions & 43 deletions README.md
@@ -1,47 +1,10 @@
# Python Crawler: Convert Liao Xuefeng's Tutorials into a PDF E-book
# Table of Contents

### System Requirements
Python 3.4 or later; Python 2.x is not supported
* [Python Crawler: Convert Liao Xuefeng's Tutorials into a PDF E-book](./pdf/README.md)
* [Drawing a "Heart" with Python](./heart/README.md)


### Required Tools

requests and beautifulsoup are the two workhorses of web crawling: requests handles the HTTP requests and beautifulsoup parses the HTML. With these two in hand the work goes quickly; a full crawler framework such as scrapy would be overkill for a small script like this. Since we are converting HTML to PDF, we also need a library for that: wkhtmltopdf is an excellent cross-platform HTML-to-PDF converter, and pdfkit is its Python wrapper (a minimal end-to-end sketch follows the installation steps below). First install the dependencies:

```shell
pip install requests
pip install beautifulsoup4
pip install pdfkit
```

### Installing wkhtmltopdf
On Windows, download a stable build of wkhtmltopdf from [http://wkhtmltopdf.org/downloads.html](http://wkhtmltopdf.org/downloads.html) and install it, then add the program's executable directory to the system $PATH variable; otherwise pdfkit cannot find wkhtmltopdf and fails with "No wkhtmltopdf executable found". On Ubuntu and CentOS it can be installed directly from the command line:

```shell
$ sudo apt-get install wkhtmltopdf # ubuntu
$ sudo yum install wkhtmltopdf   # centos
```
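
With everything installed, the whole flow is short: fetch a page with requests, pick out the article body with BeautifulSoup, and hand the assembled HTML to pdfkit. The snippet below is only a rough sketch with assumed URLs, selectors, and file names, not the project's actual crawler.py:

```python
# Rough sketch only -- the URL, CSS class, and output file names are placeholders.
import pdfkit
import requests
from bs4 import BeautifulSoup

response = requests.get("http://www.liaoxuefeng.com/")          # fetch one tutorial page
soup = BeautifulSoup(response.content, "html.parser")
body = soup.find("div", class_="x-wiki-content")                # assumed content container
html = "<html><head><meta charset='utf-8'></head><body>%s</body></html>" % body

pdfkit.from_string(html, "page.pdf")                            # one page -> PDF
# pdfkit.from_url("http://www.liaoxuefeng.com/", "site.pdf")    # or convert a URL directly
```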

### Running
```shell
python crawler.py
```

### Result
![image](./crawer-pdf.png)

### FAQ

1. SyntaxError: Missing parentheses in call to 'print'

BeautifulSoup 3 does not support Python 3, so when installing BeautifulSoup make sure to specify beautifulsoup4.
2. If you develop with PyCharm, run the script from a shell/cmd window; running it directly inside PyCharm will not find the wkhtmltopdf command (see the sketch below for pointing pdfkit at an explicit wkhtmltopdf path).
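
If wkhtmltopdf is installed but not on PATH, pdfkit can also be pointed at the binary explicitly. This is only a hedged sketch; the install path below is a placeholder for wherever wkhtmltopdf lives on your machine:

```python
# Hypothetical workaround: tell pdfkit where the wkhtmltopdf binary is
# when it cannot be found on PATH. The path below is a placeholder.
import pdfkit

config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_url("http://example.com/", "out.pdf", configuration=config)
```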


### contact me
### Contact me

>Author: liuzhijun
>WeChat ID: lzjun567
>Official account: 一个程序员的微站 (VTtalk)

>WeChat: lzjun567
>Official account: 一个程序员的微站 (id: VTtalk)
Empty file added blog/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions blog/crawler_blog.py
@@ -0,0 +1,23 @@
# encoding: utf-8
import requests

__author__ = 'liuzhijun'
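# Log in to python.jobbole.com through the WordPress admin-ajax endpoint,
# then reuse the returned session cookies to fetch a post as a logged-in user.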

if __name__ == '__main__':
cookies = {
"wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf": "liuzhijun%7C1489451730%7Ch1qqRwDqQsBrt3MdwKXXen1IMV1m31tHXITLutHszlT%7C7c5e634d83279f3cf8d37ec7db76a80d775198593d55a165cf579c9f17308c28"
}

data = {"action": "user_login",
"user_login": "liuzhijun",
"user_pass": "lzjun854977",
"remember_me": "1",}
    # redirect_url: http://www.jobbole.com
url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
response = requests.post(url, data)

for name, value in response.cookies.items():
print(name, value)

response = requests.get("http://python.jobbole.com/87305/", cookies=response.cookies)
print(response.content.decode('utf-8'))
170 changes: 170 additions & 0 deletions blog/crawler_blog_async.py
@@ -0,0 +1,170 @@
#!/usr/bin/env python
# encoding: utf-8

import time
from pymongo import MongoClient
import requests
from datetime import timedelta
import re
from bs4 import BeautifulSoup
from tornado import httpclient, gen, ioloop, queues

__author__ = 'liuzhijun'

concurrency = 10

headers = {
'Connection': 'Keep-Alive',
'Accept': 'text/html, application/xhtml+xml, */*',
'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Sa",
"Referer": "http://www.jobbole.com/",
}


@gen.coroutine
def get_posts_url_from_page(page_url):
"""
获取指定页面中所有文章的URL
:param page_url
:return:
"""
try:
response = yield httpclient.AsyncHTTPClient().fetch(page_url, headers=headers)
soup = BeautifulSoup(response.body, 'html.parser')
posts_tag = soup.find_all('div', class_="post floated-thumb")
urls = []
for index, archive in enumerate(posts_tag):
meta = archive.find("div", class_="post-meta")
url = meta.p.a['href']
urls.append(url)
raise gen.Return(urls)
except httpclient.HTTPError as e:
print('Exception: %s %s' % (e, page_url))
raise gen.Return([])


@gen.coroutine
def get_post_data_from_url(post_url, cookies):
"""
获取文章的元信息:阅读数\点赞数\收藏数\评论
:param post_url:
:return:
"""
try:
headers["Cookie"] = ";".join([name + "=" + value for name, value in cookies.items()])
response = yield httpclient.AsyncHTTPClient().fetch(post_url, headers=headers)
soup = BeautifulSoup(response.body, 'html.parser')
title = soup.find("div", class_="entry-header").get_text()
meta_tag = soup.find("div", class_="entry-meta").p
text = meta_tag.get_text()

def extract_keyword(pattern, content):
"""
利用正则表达式提取匹配的内容
"""
match = re.compile(pattern, flags=re.S).search(content)
if match:
                return int(match.group(1).replace(",", "").replace(" ", ""))  # strip thousands separators and stray spaces
else:
return 0

read_count = extract_keyword("([\d,]+) 阅读", text)
comment_count = extract_keyword("([\d,]+) 评论", text)

post_adds = soup.find("div", class_="post-adds")

vote_count = extract_keyword("([\d, ]+) 赞", post_adds.find("span", class_="vote-post-up").get_text())
bookmark_count = extract_keyword("([\d, ]+) 收藏", post_adds.find("span", class_="bookmark-btn").get_text())

post_data = {"url": post_url,
"title": title,
"read_count": read_count,
"comment_count": comment_count,
"vote_count": vote_count,
"bookmark_count": bookmark_count}
print(title)
raise gen.Return(post_data)
except httpclient.HTTPError as e:
print('Exception: %s %s' % (e, post_url))
raise gen.Return({})



@gen.coroutine
def mainx():
start = time.time()
fetched = 0
client = MongoClient('mongodb://localhost:27017/')
db = client['posts']
cookies = {
'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}
post_queue = queues.Queue()
page_queue = queues.Queue()
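    # Two-stage producer/consumer pipeline: page_queue feeds listing pages to the URL
    # workers, which push individual post URLs onto post_queue for the data workers;
    # `concurrency` workers of each kind are spawned further down.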
for i in range(1, 69):
page_url = "http://python.jobbole.com/all-posts/page/{page}/".format(page=i)
page_queue.put(page_url)
print(page_url)

@gen.coroutine
def posts_url_worker():
while True:
page = yield page_queue.get()
urls = yield get_posts_url_from_page(page)
for u in urls:
post_queue.put(u)
page_queue.task_done()

@gen.coroutine
def post_data_worker():
while True:
url = yield post_queue.get()
post = yield get_post_data_from_url(url, cookies)
nonlocal fetched
fetched += 1
db.posts.insert_one(post)
post_queue.task_done()

for _ in range(concurrency):
posts_url_worker()
for _ in range(concurrency):
post_data_worker()

yield page_queue.join()
yield post_queue.join()
# yield q.join(timeout=timedelta(seconds=300))
    print('Crawled %s posts in %d seconds total.' % (fetched, time.time() - start))


def login():
"""
登录账户,获取登录cookie信息
:return:
"""
url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
account = {"action": "user_login",
"user_login": "liuzhijun",
"user_pass": "**********",
"remember_me": "1"}
response = requests.post(url, data=account)
print(response.cookies)
cookies = dict((name, value) for name, value in response.cookies.items())
return cookies


if __name__ == '__main__':
# print(login())
#
# import logging
#
# logging.basicConfig()
io_loop = ioloop.IOLoop.current()
# io_loop.run_sync(main)
# io_loop.run_sync(lambda: get_all_post_url(67))
cookies = {
'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}

# io_loop.run_sync(lambda: get_post_data_from_url("http://python.jobbole.com/87288/", cookies))
io_loop.run_sync(mainx)
Binary file removed crawer-pdf.png
Binary file not shown.
26 changes: 26 additions & 0 deletions heart/README.md
@@ -0,0 +1,26 @@
### Preparation
The general idea: crawl my Weibo posts, clean and process the data, run it through Chinese word segmentation, and hand the result to a word-cloud tool, which together with a scientific computing library and a plotting library produces the final image. The packages involved are:

Requests for the network requests that crawl the Weibo data, jieba for Chinese word segmentation, wordcloud for the word cloud, Pillow for image processing, NumPy for scientific computing, and Matplotlib, a MATLAB-like 2D plotting library (a minimal sketch of the segmentation and word-cloud step follows below).
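
As a rough illustration of the segmentation and word-cloud steps only, assuming the Weibo texts have already been crawled (the sample texts, font path, and output file below are placeholders):

```python
# Minimal sketch: jieba keyword extraction feeding a WordCloud image.
# The sample texts, font path, and output file name are placeholder assumptions.
import jieba.analyse
from wordcloud import WordCloud

weibo_texts = ["今天天气不错,适合出门走走", "分享一篇 Python 爬虫的教程"]

tags = []
for text in weibo_texts:
    tags.extend(jieba.analyse.extract_tags(text, topK=20))

wc = WordCloud(font_path="msyh.ttc", background_color="white")
wc.generate(" ".join(tags))
wc.to_file("wordcloud.png")
```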

### Installing the Tools
These packages can fail to install in platform-specific ways. wordcloud, requests, and jieba can all be installed online with plain pip:
```shell
pip install wordcloud
pip install requests
pip install jieba
```
On Windows, installing Pillow, NumPy, and Matplotlib directly with pip tends to run into various problems. A recommended alternative is to download the corresponding .whl files from the third-party site Python Extension Packages for Windows [1] and install them locally. Choose the files that match your environment: cp27 corresponds to Python 2.7 and amd64 to a 64-bit system. After downloading, install them with pip:
```shell
pip install Pillow-4.0.0-cp27-cp27m-win_amd64.whl
pip install scipy-0.18.0-cp27-cp27m-win_amd64.whl
pip install numpy-1.11.3+mkl-cp27-cp27m-win_amd64.whl
pip install matplotlib-1.5.3-cp27-cp27m-win_amd64.whl
```
On other platforms, Google the error message to resolve it, or open an [issue](https://github.com/lzjun567/crawler_html2pdf/issues) on GitHub.

### Contact me

>Author: liuzhijun
>WeChat: lzjun567
>Official account: 一个程序员的微站 (id: VTtalk)
Empty file added heart/__init__.py
Empty file.
Binary file added heart/heart-mask.jpg
Binary file added heart/heart.jpg
87 changes: 87 additions & 0 deletions heart/heart.py
@@ -0,0 +1,87 @@
# -*- coding:utf-8 -*-
import codecs
import csv
import re

import jieba.analyse
import matplotlib.pyplot as plt
import requests
from scipy.misc import imread
from wordcloud import WordCloud

__author__ = 'liuzhijun'

cookies = {
"ALF": "xxxx",
"SCF": "xxxxxx.",
"SUBP": "xxxxx",
"SUB": "xxxx",
"SUHB": "xxx-", "xx": "xx", "_T_WM": "xxx",
"gsScrollPos": "", "H5_INDEX": "0_my", "H5_INDEX_TITLE": "xxx",
"M_WEIBOCN_PARAMS": "xxxx"
}


def fetch_weibo():
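    """Crawl the logged-in user's Weibo timeline page by page via the m.weibo.cn cards API and yield each cleaned post text."""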
api = "http://m.weibo.cn/index/my?format=cards&page=%s"
for i in range(1, 102):
response = requests.get(url=api % i, cookies=cookies)
data = response.json()[0]
groups = data.get("card_group") or []
for group in groups:
text = group.get("mblog").get("text")
text = text.encode("utf-8")

def cleanring(content):
"""
                Strip HTML tags, repost markers, and punctuation that carry no useful content.
"""
pattern = "<a .*?/a>|<i .*?/i>|转发微博|//:|Repost|,|?|。|、|分享图片"
content = re.sub(pattern, "", content)
return content

text = cleanring(text).strip()
if text:
yield text


def write_csv(texts):
with codecs.open('./weibo.csv', 'w') as f:
writer = csv.DictWriter(f, fieldnames=["text"])
writer.writeheader()
for text in texts:
writer.writerow({"text": text})


def read_csv():
with codecs.open('./weibo.csv', 'r') as f:
reader = csv.DictReader(f)
for row in reader:
yield row['text']


def word_segment(texts):
jieba.analyse.set_stop_words("./stopwords.txt")
for text in texts:
tags = jieba.analyse.extract_tags(text, topK=20)
yield " ".join(tags)


def generate_img(texts):
data = " ".join(text for text in texts)

mask_img = imread('./heart-mask.jpg', flatten=True)
wordcloud = WordCloud(
font_path='msyh.ttc',
background_color='white',
mask=mask_img
).generate(data)
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./heart.jpg', dpi=600)


if __name__ == '__main__':
texts = fetch_weibo()
write_csv(texts)
generate_img(word_segment(read_csv()))