forked from lzjun567/python_scripts

Commit: showing 17 changed files with 1,309 additions and 43 deletions.
@@ -1,47 +1,10 @@

# Python crawler: turn Liao Xuefeng's tutorial into a PDF e-book
# Table of contents

### System requirements
Python 3.4 or later; Python 2.x is not supported.

* [Python crawler: turn Liao Xuefeng's tutorial into a PDF e-book](./pdf/README.md)
* [Drawing a "heart" with Python](./heart/README.md)

### Required tools

requests and beautifulsoup are the two workhorses of crawling: requests handles the network requests, and beautifulsoup parses and manipulates the HTML. With these two in hand the work goes quickly; a full framework like Scrapy would be overkill for a small script like this. Since the job is converting HTML to PDF, we also need library support for that step: wkhtmltopdf is an excellent tool for HTML-to-PDF conversion across platforms, and pdfkit is its Python wrapper. First install the dependencies:

```shell
pip install requests
pip install beautifulsoup4
pip install pdfkit
```
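Once the dependencies are installed, pdfkit reduces the conversion itself to a single call. A minimal sketch, with the URL and output file name as illustrative placeholders:

```python
import pdfkit

# Convert one page straight from its URL to a PDF file
# (both arguments here are placeholders, not project values).
pdfkit.from_url("http://www.liaoxuefeng.com/", "liaoxuefeng.pdf")
```

pdfkit also offers from_file and from_string for local HTML, which is what a crawler that first downloads and cleans pages would use.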
### Installing wkhtmltopdf
On Windows, download a stable build of wkhtmltopdf from [http://wkhtmltopdf.org/downloads.html](http://wkhtmltopdf.org/downloads.html) and install it, then add the program's executable directory to the system $PATH; otherwise pdfkit cannot find wkhtmltopdf and fails with "No wkhtmltopdf executable found". On Ubuntu and CentOS it installs from the command line:

```shell
$ sudo apt-get install wkhtmltopdf  # ubuntu
$ sudo yum install wkhtmltopdf      # centos
```
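If adjusting $PATH is inconvenient (see also FAQ item 2 below), pdfkit can be pointed at the executable directly. A sketch, assuming a typical Windows install location rather than anything this project specifies:

```python
import pdfkit

# Tell pdfkit where wkhtmltopdf lives when it is not on $PATH;
# the path below is an assumed default Windows install location.
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_url("http://www.liaoxuefeng.com/", "liaoxuefeng.pdf", configuration=config)
```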
### Running

```shell
python crawler.py
```

### Result
![image](./crawer-pdf.png)

### FAQ

1. SyntaxError: Missing parentheses in call to 'print'

   BeautifulSoup 3 does not support Python 3, so when installing make sure you specify beautifulsoup4.
2. If you develop in PyCharm, run the script from a shell/cmd window; launched from inside PyCharm it cannot find the wkhtmltopdf command.

### Contact me

> Author: liuzhijun
> WeChat: lzjun567
> WeChat official account: 一个程序员的微站 (id: VTtalk)
@@ -0,0 +1,23 @@

# encoding: utf-8
import requests

__author__ = 'liuzhijun'

if __name__ == '__main__':
    # Cookie captured from a logged-in browser session (kept for reference;
    # the POST below obtains fresh cookies instead).
    cookies = {
        "wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf": "liuzhijun%7C1489451730%7Ch1qqRwDqQsBrt3MdwKXXen1IMV1m31tHXITLutHszlT%7C7c5e634d83279f3cf8d37ec7db76a80d775198593d55a165cf579c9f17308c28"
    }

    # Form fields expected by the WordPress AJAX login endpoint.
    data = {"action": "user_login",
            "user_login": "liuzhijun",
            "user_pass": "lzjun854977",
            "remember_me": "1"}
    # redirect_url http://www.jobbole.com
    url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
    response = requests.post(url, data)

    # The session cookies come back on the login response.
    for name, value in response.cookies.items():
        print(name, value)

    # Reuse those cookies to fetch a page as the logged-in user.
    response = requests.get("http://python.jobbole.com/87305/", cookies=response.cookies)
    print(response.content.decode('utf-8'))
@@ -0,0 +1,170 @@

#!/usr/bin/env python
# encoding: utf-8

import re
import time
from datetime import timedelta

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from tornado import httpclient, gen, ioloop, queues

__author__ = 'liuzhijun'

concurrency = 10

headers = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Sa",
    "Referer": "http://www.jobbole.com/",
}


@gen.coroutine
def get_posts_url_from_page(page_url):
    """
    Collect the URLs of every post on one listing page.
    :param page_url: URL of the listing page
    :return: list of post URLs
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(page_url, headers=headers)
        soup = BeautifulSoup(response.body, 'html.parser')
        posts_tag = soup.find_all('div', class_="post floated-thumb")
        urls = []
        for index, archive in enumerate(posts_tag):
            meta = archive.find("div", class_="post-meta")
            url = meta.p.a['href']
            urls.append(url)
        raise gen.Return(urls)
    except httpclient.HTTPError as e:
        print('Exception: %s %s' % (e, page_url))
        raise gen.Return([])


@gen.coroutine
def get_post_data_from_url(post_url, cookies):
    """
    Fetch a post's metadata: read count, comment count, vote count, bookmark count.
    :param post_url: URL of the post
    :param cookies: login cookies, sent so member-only counters are visible
    :return: dict of post metadata
    """
    try:
        headers["Cookie"] = ";".join([name + "=" + value for name, value in cookies.items()])
        response = yield httpclient.AsyncHTTPClient().fetch(post_url, headers=headers)
        soup = BeautifulSoup(response.body, 'html.parser')
        title = soup.find("div", class_="entry-header").get_text()
        meta_tag = soup.find("div", class_="entry-meta").p
        text = meta_tag.get_text()

        def extract_keyword(pattern, content):
            """
            Pull the first number matched by the pattern out of the content;
            thousands separators are dropped and stray spaces become '0'.
            """
            match = re.compile(pattern, flags=re.S).search(content)
            if match:
                return int(match.group(1).replace(",", '').replace(" ", "0"))
            else:
                return 0

        read_count = extract_keyword("([\d,]+) 阅读", text)
        comment_count = extract_keyword("([\d,]+) 评论", text)

        post_adds = soup.find("div", class_="post-adds")

        vote_count = extract_keyword("([\d, ]+) 赞", post_adds.find("span", class_="vote-post-up").get_text())
        bookmark_count = extract_keyword("([\d, ]+) 收藏", post_adds.find("span", class_="bookmark-btn").get_text())

        post_data = {"url": post_url,
                     "title": title,
                     "read_count": read_count,
                     "comment_count": comment_count,
                     "vote_count": vote_count,
                     "bookmark_count": bookmark_count}
        print(title)
        raise gen.Return(post_data)
    except httpclient.HTTPError as e:
        print('Exception: %s %s' % (e, post_url))
        raise gen.Return({})


@gen.coroutine
def mainx():
    start = time.time()
    fetched = 0
    client = MongoClient('mongodb://localhost:27017/')
    db = client['posts']
    cookies = {
        'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
        'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}
    post_queue = queues.Queue()
    page_queue = queues.Queue()
    # Seed the page queue with all 68 listing pages.
    for i in range(1, 69):
        page_url = "http://python.jobbole.com/all-posts/page/{page}/".format(page=i)
        page_queue.put(page_url)
        print(page_url)

    @gen.coroutine
    def posts_url_worker():
        # Producer: turn listing pages into individual post URLs.
        while True:
            page = yield page_queue.get()
            urls = yield get_posts_url_from_page(page)
            for u in urls:
                post_queue.put(u)
            page_queue.task_done()

    @gen.coroutine
    def post_data_worker():
        # Consumer: fetch each post's metadata and store it in MongoDB.
        while True:
            url = yield post_queue.get()
            post = yield get_post_data_from_url(url, cookies)
            nonlocal fetched
            fetched += 1
            db.posts.insert_one(post)
            post_queue.task_done()

    for _ in range(concurrency):
        posts_url_worker()
    for _ in range(concurrency):
        post_data_worker()

    yield page_queue.join()
    yield post_queue.join()
    # yield q.join(timeout=timedelta(seconds=300))
    print('Crawled %s posts in %d seconds.' % (fetched, time.time() - start))


def login():
    """
    Log in and return the session cookies.
    :return: dict of cookie name/value pairs
    """
    url = "http://python.jobbole.com/wp-admin/admin-ajax.php"
    account = {"action": "user_login",
               "user_login": "liuzhijun",
               "user_pass": "**********",
               "remember_me": "1"}
    response = requests.post(url, data=account)
    print(response.cookies)
    cookies = dict((name, value) for name, value in response.cookies.items())
    return cookies


if __name__ == '__main__':
    # print(login())
    #
    # import logging
    #
    # logging.basicConfig()
    io_loop = ioloop.IOLoop.current()
    # io_loop.run_sync(main)
    # io_loop.run_sync(lambda: get_all_post_url(67))
    cookies = {
        'wordpress_logged_in_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C812c5106ea45baeae74102845a2c6d269de6b7547e85a5613b575aa9c8708add',
        'wordpress_0efdf49af511fd88681529ef8c2e5fbf': 'liuzhijun%7C1489462391%7CcFSvpRWbyJcPRGSIelRPWRIqUNdIQnF5Jjh1BrBPQI2%7C0edb104a0e34927a3c18e3fc4f10cc051153b1252f1f74efd7b57d21613e1f92'}

    # io_loop.run_sync(lambda: get_post_data_from_url("http://python.jobbole.com/87288/", cookies))
    io_loop.run_sync(mainx)
@@ -0,0 +1,26 @@

### Preparation
The overall approach: crawl the Weibo data, clean and process it, run it through Chinese word segmentation, hand the result to a word-cloud tool, and render the image with the help of scientific-computing and plotting libraries. The packages involved:

Requests for the network requests that fetch the Weibo data, jieba for Chinese word segmentation, wordcloud for the word cloud itself, Pillow for image processing, NumPy for scientific computing, and Matplotlib, a MATLAB-like 2D plotting library.
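As a minimal sketch of how these pieces chain together (the input text, font file, and output name are illustrative placeholders; the full script appears later in this commit):

```python
# -*- coding:utf-8 -*-
import jieba.analyse
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Placeholder input; in practice this is the cleaned Weibo text.
text = u"Python 爬虫 数据 分析 词云 微博 程序员"

# jieba extracts the top keywords, WordCloud lays them out,
# and matplotlib renders the image to disk.
tags = jieba.analyse.extract_tags(text, topK=20)
cloud = WordCloud(font_path='msyh.ttc', background_color='white').generate(" ".join(tags))
plt.imshow(cloud)
plt.axis('off')
plt.savefig('./demo.png', dpi=300)
```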
### Installing the tools
Installing these packages can fail in different ways on different platforms. wordcloud, requests, and jieba all install fine with plain pip:

```shell
pip install wordcloud
pip install requests
pip install jieba
```
On Windows, installing Pillow, NumPy, and Matplotlib directly with pip runs into assorted problems. A more reliable route is to download the matching .whl files from the third-party site Python Extension Packages for Windows [1] and install them locally. Pick the files for your environment: cp27 corresponds to Python 2.7, amd64 to a 64-bit system. After downloading:

```shell
pip install Pillow-4.0.0-cp27-cp27m-win_amd64.whl
pip install scipy-0.18.0-cp27-cp27m-win_amd64.whl
pip install numpy-1.11.3+mkl-cp27-cp27m-win_amd64.whl
pip install matplotlib-1.5.3-cp27-cp27m-win_amd64.whl
```

On other platforms, Google the error messages, or file an [issue](https://github.com/lzjun567/crawler_html2pdf/issues) on GitHub.
### Contact me

> Author: liuzhijun
> WeChat: lzjun567
> WeChat official account: 一个程序员的微站 (id: VTtalk)
@@ -0,0 +1,87 @@

# -*- coding:utf-8 -*-
import codecs
import csv
import re

import jieba.analyse
import matplotlib.pyplot as plt
import requests
from scipy.misc import imread
from wordcloud import WordCloud

__author__ = 'liuzhijun'

# Cookies copied from a logged-in m.weibo.cn browser session;
# the values are redacted placeholders.
cookies = {
    "ALF": "xxxx",
    "SCF": "xxxxxx.",
    "SUBP": "xxxxx",
    "SUB": "xxxx",
    "SUHB": "xxx-", "xx": "xx", "_T_WM": "xxx",
    "gsScrollPos": "", "H5_INDEX": "0_my", "H5_INDEX_TITLE": "xxx",
    "M_WEIBOCN_PARAMS": "xxxx"
}


def fetch_weibo():
    """
    Yield the text of each post on the logged-in user's timeline.
    """
    api = "http://m.weibo.cn/index/my?format=cards&page=%s"

    def cleanring(content):
        """
        Strip markup and filler that carry no meaning.
        """
        pattern = "<a .*?/a>|<i .*?/i>|转发微博|//:|Repost|,|?|。|、|分享图片"
        content = re.sub(pattern, "", content)
        return content

    for i in range(1, 102):
        response = requests.get(url=api % i, cookies=cookies)
        data = response.json()[0]
        groups = data.get("card_group") or []
        for group in groups:
            text = group.get("mblog").get("text")
            text = text.encode("utf-8")
            text = cleanring(text).strip()
            if text:
                yield text


def write_csv(texts):
    """
    Save the crawled texts to a local CSV so later steps can rerun offline.
    """
    with codecs.open('./weibo.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames=["text"])
        writer.writeheader()
        for text in texts:
            writer.writerow({"text": text})


def read_csv():
    """
    Yield the saved texts back from the CSV.
    """
    with codecs.open('./weibo.csv', 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            yield row['text']


def word_segment(texts):
    """
    Segment each text with jieba and yield its top keywords.
    """
    jieba.analyse.set_stop_words("./stopwords.txt")
    for text in texts:
        tags = jieba.analyse.extract_tags(text, topK=20)
        yield " ".join(tags)


def generate_img(texts):
    """
    Render the keywords as a word cloud shaped by the heart mask image.
    """
    data = " ".join(text for text in texts)

    mask_img = imread('./heart-mask.jpg', flatten=True)
    wordcloud = WordCloud(
        font_path='msyh.ttc',
        background_color='white',
        mask=mask_img
    ).generate(data)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('./heart.jpg', dpi=600)


if __name__ == '__main__':
    texts = fetch_weibo()
    write_csv(texts)
    generate_img(word_segment(read_csv()))