-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathtoutiao_spider.py
executable file
·147 lines (130 loc) · 4.8 KB
/
toutiao_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
Description: Get pisture collections from www.toutiao.com
Tip: This script comes from https://github.com/smslit/spider-collection/tree/master/ttpic
Link: https://www.smslit.top/2018/06/21/spider-practice-pic-dog/
"""
__author__ = '5km(smslit)'
__date__ = '20180627'
import os
import requests
from hashlib import md5
from functools import partial
from urllib.parse import urlencode
from multiprocessing.pool import Pool
PAGE_NUM = 5
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
'X-Requested-With': 'XMLHttpRequest'
}
def get_page_json(offset, keyword):
'''生成ajax请求,发出get请求,并获取响应结果,以字典的形式返回json数据
:param offset: 页面请求偏移值
:type offset: 能被20整除的整数
:param keyword: 图片搜索的关键词
:type keyword: unicode字符串
:return: 响应数据
:rtype: dict
'''
params = {
'aid': 24,
'app_name': 'web_search',
'offset': offset,
'format': 'json',
'keyword': keyword,
'autoload': 'true',
'count': '20',
'cur_tab': '3',
'from': 'gallery'
}
url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
try:
response = requests.get(url, headers=HEADERS)
if response.status_code == 200:
return response.json()
except requests.ConnectionError as e:
print('Error', e)
def parse_images_url(json):
'''解析json字典,获得对应条目标题和图片链接
:param json: 页面请求响应结果对应的字典数据
:type json: dict
'''
data = json['data']
if data:
for item in data:
try:
title = item['title']
images = item['image_list']
except KeyError:
continue
if images:
for image in images:
yield {
'image': image['url'].replace('list', 'origin').replace('190x124/', ''),
'title': title
}
def save_image_from(image_info, to_dir=''):
'''根据链接获取图片,保存到标题命名的目录中,图片以md5码命名
:param image_info: 包含标题和链接信息的字典数据
:type image_info: dict
:param to_dir: 要保存到的目录,默认值是空字符串,可指定目录,格式如: '狗狗'
:type to_dir: unicode 字符串
'''
if image_info:
print(image_info)
image_dir = to_dir + '/' + image_info['title']
if not os.path.exists(image_dir):
os.makedirs(image_dir)
try:
response = requests.get(image_info['image'])
if response.status_code == 200:
image_path = '{0}/{1}.{2}'.format(image_dir,
md5(response.content).hexdigest(), 'jpg')
print(image_path)
if not os.path.exists(image_path):
with open(image_path, 'wb') as f:
f.write(response.content)
print('图片下载完成!')
else:
print('图片已下载 -> ', image_path)
except requests.ConnectionError as e:
print('图片下载失败!')
except:
print('出现异常!')
def get_images_of(offset, keyword):
'''主函数,用于多进程调度
:param offset: 页面链接offset参数的值
:type offset: 能被20整除的整数
:param keyword: 图片搜索关键词
:type keyword: unicode字符串
'''
print('获取第', offset + 1, '~', offset + 20, '个条目...')
json = get_page_json(offset, keyword)
for image in parse_images_url(json):
path = os.path.join(os.path.expanduser(
'~'), 'Pictures/python/toutiao', keyword)
save_image_from(image, path)
def main():
print('\nWelcome here to get pictures from www.toutiao.com!')
keyword = input('Please input your search keywords > ')
count = None
while count == None:
number_str = input(
'Please input count of picture collection that you want(Divisible by 20 ) > ')
try:
count = int(number_str)
except ValueError:
print('Please input a valid number!')
if count > 0:
print('Getting %s pictures...' % keyword)
page_num = count // 20 + (0 if count % 20 == 0 else 1)
offset_list = [x * 20 for x in range(0, page_num)]
pool = Pool()
partial_getter = partial(get_images_of, keyword=keyword)
pool.map(partial_getter, offset_list)
pool.close()
pool.join()
else:
print('Get Cancel!')
if __name__ == '__main__':
main()