forked from loveQt/Zhihu_voters
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzhihu-vote.py
124 lines (114 loc) · 3.55 KB
/
zhihu-vote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
__author__ = 'loveQt'
import json
import os
import time
import re
import requests
import ConfigParser
import sys
import xlwt
from bs4 import BeautifulSoup
# Base URL of the Zhihu site; every request URL in this script is built on it.
Zhihu = 'http://www.zhihu.com/'
# Endpoint that login() POSTs the e-mail/password form to.
Login_url = Zhihu + 'login'
# Voter pages are requested from
#   Zhihu + 'answer/<ans_id>/voters_profile?total=<total>&offset=<offset>'
# (the URL itself is assembled inside get_voters()).
def login():
    """Create the module-level session ``s`` and post the login form to Zhihu.

    Reads ``config.ini`` from the current working directory:

    * ``[info]``    -- ``email`` and ``password`` used as the login form data.
    * ``[cookies]`` -- optional; every option in it is attached to the
      session as a cookie, so it is sent with the login request and with
      all later requests made through ``s``.

    The session is published through the global name ``s`` because
    get_voters() reads it from there. The login response is not inspected.

    Raises:
        ConfigParser.NoSectionError / NoOptionError: if ``[info]`` or one of
            its two options is missing from ``config.ini``.
    """
    global s

    cf = ConfigParser.ConfigParser()
    cf.read("config.ini")
    email = cf.get("info", "email")
    password = cf.get("info", "password")

    # Use the public API instead of the private ``cf._sections`` mapping:
    # in Python 2 that mapping also contains an internal '__name__' entry,
    # and indexing it raises KeyError when the section is absent.
    # raw=True keeps '%' characters in cookie values from being treated as
    # interpolation markers.
    if cf.has_section("cookies"):
        cookies = dict(cf.items("cookies", raw=True))
    else:
        cookies = {}

    s = requests.session()
    # The configured cookies used to be loaded and then dropped; attach them
    # to the session so they actually take effect.
    s.cookies.update(cookies)

    login_data = {"email": email, "password": password}
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': "http://www.zhihu.com/",
        'X-Requested-With': "XMLHttpRequest"
    }
    s.post(Login_url, data=login_data, headers=header)
# NOTE(review): get_voters() calls login() itself before its first request,
# so this extra module-level call looks redundant -- confirm before removing.
login()
# Number of voter entries the code assumes per voters_profile page; both the
# request offset and the spreadsheet row base advance in steps of this size.
_VOTERS_PER_PAGE = 10

# (spreadsheet column, pattern, decode_unicode_escape) for every extracted
# field. The patterns run against the raw response text, in which the HTML
# payload is still JSON-escaped -- hence the literal backslashes (\" and
# \uXXXX) instead of real quotes or Chinese characters.
_VOTER_FIELDS = (
    # user name (title attribute of the profile link)
    (0, re.compile(r'a title=\\"(.+?)\\"'), True),
    # profile URL
    (5, re.compile(r'href=\\"(http://www.zhihu.com/people/.*?)\\'), False),
    # token in front of the escaped word for "upvotes"
    (1, re.compile(r'([_a-zA-Z0-9_]{0,10}) \\u8d5e\\u540c'), False),
    # token in front of the escaped word for "thanks"
    (2, re.compile(r'([_a-zA-Z0-9_]{0,10}) \\u611f\\u8c22'), False),
    # token in front of the escaped word for "questions asked"
    (3, re.compile(r'([_a-zA-Z0-9_]{0,10}) \\u63d0\\u95ee'), False),
    # token in front of the escaped word for "answers"
    (4, re.compile(r'([_a-zA-Z0-9_]{0,10}) \\u56de\\u7b54'), False),
)


def _fetch_with_retry(url):
    """Return the session's response for *url*, retrying once after 2 s."""
    try:
        return s.get(url)
    except requests.RequestException:
        # Only network-level failures are retried; a second failure
        # propagates to the caller. (A bare ``except`` would also swallow
        # KeyboardInterrupt.)
        time.sleep(2)
        return s.get(url)


def _write_column(sheet, first_row, column, values):
    """Write *values* downwards into *column*, starting at *first_row*."""
    for row, value in enumerate(values, first_row):
        sheet.write(row, column, value)


def get_voters():
    """Download the voters of one answer and save them to an .xls workbook.

    Logs in, asks for the answer id on stdin, reads the total number of
    voters from the first ``voters_profile`` response, then fetches the
    voter pages and writes one spreadsheet row per regex match:

        column 0  user name          column 3  questions asked
        column 1  upvotes            column 4  answers
        column 2  thanks             column 5  profile URL

    The workbook is saved as ``<ans_id>v1result.xls`` in the current
    working directory.

    NOTE(review): each column is filled from its own independent regex
    match list, so rows only line up when every voter entry matches every
    pattern -- confirm against real responses.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('data', cell_overwrite_ok=True)
    login()

    # Strip stray whitespace so it cannot end up in the URL or the file name.
    ans_id = raw_input('请输入抓包获得的问题id:').strip()
    profile_url = Zhihu + 'answer/' + ans_id + '/voters_profile'

    # ``content`` is already a byte string; calling .encode('utf-8') on it
    # forces an implicit ASCII decode in Python 2 and crashes on the first
    # non-ASCII byte, so the bytes are used as they are.
    target = json.loads(_fetch_with_retry(profile_url).content)
    total = target['paging']['total']
    print('总赞同数: ' + str(total))

    # Ceiling division: exactly as many pages as are needed, without the
    # spurious extra request when total is a multiple of the page size.
    page_num = (total + _VOTERS_PER_PAGE - 1) // _VOTERS_PER_PAGE

    print('Downloading...')
    for num in range(page_num):
        first_row = num * _VOTERS_PER_PAGE
        vote_url = (profile_url + '?total=' + str(total)
                    + '&offset=' + str(first_row))
        html = _fetch_with_retry(vote_url).content

        for column, pattern, decode in _VOTER_FIELDS:
            values = pattern.findall(html)
            if decode:
                # Names arrive as \uXXXX escapes; turn them into real text.
                values = [value.decode("unicode-escape") for value in values]
            _write_column(sheet, first_row, column, values)

    # Plain relative name: the former r'.\\' prefix was Windows-only and
    # produced a file literally called '.\\<id>v1result.xls' elsewhere.
    book.save(str(ans_id) + 'v1result.xls')
    print('Mission Complete')
# Report how long the whole scrape took, in seconds.
# time.time() measures wall-clock time on every platform; time.clock()
# returned CPU time on Unix (close to zero for this network-bound script)
# and no longer exists in Python 3.8+.
start = time.time()
get_voters()
end = time.time()
print(end - start)