-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
142 lines (122 loc) · 4.69 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from bs4 import BeautifulSoup
import os
import json
import re
import time
import traceback
import function as func
from db import FollowerLog,QueryTaskStack,Base
from db import build_session
from datetime import datetime
from typing import Any
# Database used when config.json does not opt in to a remote one.
db_url = 'sqlite:///resource/database.db'

# JSON is read explicitly as UTF-8 so that loading does not depend on the
# platform's default locale encoding.
with open('config.json', 'r', encoding='utf-8') as fp:
    config = json.load(fp)

# `remote` is treated as optional: when it is absent or falsy the local
# default above is kept. When it is truthy, `db_url` must be present in the
# config (a missing key still raises KeyError, since that is a real
# misconfiguration).
if config.get('remote'):
    db_url = config['db_url']
def table_attr(Table:Base) -> list:
    """Return the public attribute names defined directly on ``Table``.

    Only the class's own ``__dict__`` is inspected, in its iteration order;
    every name that starts with an underscore is left out.
    """
    return [attr_name for attr_name in vars(Table) if not attr_name.startswith('_')]
class WeiboFans():
    """Crawl Weibo user profiles and store follower snapshots in the database.

    The uids to crawl are read from ``QueryTaskStack`` rows; every successful
    fetch is written as one ``FollowerLog`` row.
    """

    def __init__(self):
        pass

    def get_uid_from_home(self):
        """Placeholder; not implemented."""
        pass

    # Fetch the API response for a uid.
    def get_apidata(self, uid):
        """Fetch the profile payload of ``uid`` from the m.weibo.cn container API.

        :param uid: numeric Weibo user id (``str`` or ``int``).
        :return: the raw response body as text; ``loop_crawl`` parses it as JSON.
        """
        prefix = 'https://m.weibo.cn/api/container/getIndex?type=uid&value='
        # Alternative endpoint kept for reference (unused):
        # https://weibo.com/ajax/profile/info?uid=
        url = prefix + str(uid)
        # NOTE(review): func.get_response is assumed to return a
        # requests-like response object exposing `.text` / `.url` -- confirm.
        response = func.get_response(url)
        return response.text

    def get_uid_redirect_url(self, ref):
        """Resolve the numeric uid behind a profile URL that redirects.

        Intended for profiles that use a custom domain: the request is
        followed and the first run of digits in the final URL is returned.

        :param ref: profile URL to request.
        :return: the first digit sequence found in the final URL, as ``str``.
        :raises ValueError: if the final URL contains no digits.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OP'
                          'D3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chr'
                          'ome/84.0.4147.89 Mobile Safari/537.36 Edg/84.0.522.48'
        }
        content = func.get_response(ref, headers=headers)
        match = re.search(r'[0-9]+', content.url)
        if match is None:
            # Previously this surfaced as an opaque AttributeError on None.
            raise ValueError(f'no numeric uid found in redirected URL: {content.url}')
        return match.group()

    @staticmethod
    def _parse_followers_count(raw):
        """Convert an abbreviated follower count string to an ``int``.

        Handles a plain number (``'1234'``) as well as the ``万`` (x10,000)
        and ``亿`` (x100,000,000) unit suffixes, e.g. ``'1.5万' -> 15000``.

        :raises ValueError: if the numeric part cannot be parsed.
        """
        text = raw.strip()
        multiplier = 1
        for suffix, factor in (('亿', 100000000), ('万', 10000)):
            if text.endswith(suffix):
                text = text[:-len(suffix)]
                multiplier = factor
                break
        # round() instead of int() so float error cannot truncate one unit off.
        return round(float(text) * multiplier)

    def loop_crawl(self, uid_list: list):
        """Fetch every uid in ``uid_list`` and persist one ``FollowerLog`` each.

        A uid whose fetch or JSON decoding fails is reported and skipped, so
        that a previous uid's payload is never stored under a new timestamp.

        :param uid_list: uids to crawl, in order.
        :return: ``False`` as soon as the API answers ``ok == 0`` (the
            remaining uids are not processed) or if any uid had to be
            skipped; ``True`` when every uid was stored.
        :raises SystemExit: if writing a row to the database fails.
        """
        # Attribute names of FollowerLog; the first one is dropped because it
        # is expected to be the primary key `id`.
        user_info_keys = table_attr(FollowerLog)[1:]
        all_stored = True
        for uid in uid_list:
            try:
                detail = self.get_apidata(uid)
                detail_json = json.loads(detail)
            except Exception as err:
                print('遇到了错误。')
                print(err)
                traceback.print_exc()
                all_stored = False
                continue
            response_time = datetime.now()
            if detail_json['ok'] == 0:
                return False

            user_info_data = detail_json['data']['userInfo']
            # Fields absent from the payload are stored as empty strings.
            user_info = {k: user_info_data.get(k, '') for k in user_info_keys}
            user_info['uid'] = user_info_data.get('id', '')
            user_info['raw_data'] = detail
            user_info['response_time'] = response_time

            followers = user_info.get('followers_count')
            # The count may arrive as an abbreviated string; an empty string
            # (field missing) is left as-is instead of crashing float('').
            if isinstance(followers, str) and followers.strip():
                user_info['followers_count'] = self._parse_followers_count(followers)

            new_log = FollowerLog(**user_info)
            session = build_session(db_url=db_url)
            try:
                session.add(new_log)
                session.commit()
                print(f'成功添加数据:{user_info["screen_name"]}')
            except Exception as err:
                session.rollback()
                print(err)
                traceback.print_exc()
                raise SystemExit('数据库连接错误。')
            finally:
                # One session is opened per uid; always release it.
                session.close()
        return all_stored

    def add_query_task(self, **kwargs):
        """Queue a uid for crawling by inserting a ``QueryTaskStack`` row.

        Keyword arguments that are not attributes of ``QueryTaskStack`` are
        reported and dropped. ``add_time`` is always set to the current time.

        :return: ``True`` if a task with the same uid already exists,
            otherwise ``None`` after the new task has been committed.
        :raises SystemExit: if ``uid`` is missing or ``None``, or if the
            database operation fails (exit status -1).
        """
        if kwargs.get('uid') is None:
            raise SystemExit('设置博主uid不能为空值')

        valid_attrs = set(table_attr(QueryTaskStack))
        avail = {}
        for k, v in kwargs.items():
            if k in valid_attrs:
                avail[k] = v
            else:
                # Really skip it: passing an unknown keyword to the model
                # constructor would fail.
                print("无效属性,跳过")
        avail['add_time'] = datetime.now()

        session = build_session(db_url=db_url)
        try:
            if session.query(QueryTaskStack.uid).filter_by(uid=str(kwargs['uid'])).count() > 0:
                print('已存在相似任务。')
                return True
            session.add(QueryTaskStack(**avail))
            session.commit()
        except Exception as err:
            session.rollback()
            print(err)
            traceback.print_exc()
            raise SystemExit(-1)
        finally:
            session.close()

    def add_batch_query_task(self):
        """Placeholder; not implemented."""
        pass

    def del_query_task(self, uid):
        """Remove the queued task(s) for ``uid``.

        Prints a notice when no task with that uid exists. Database errors are
        rolled back and reported, not raised.
        """
        session = build_session(db_url=db_url)
        try:
            # Query.delete() issues the DELETE and returns the matched row
            # count; the old code called .delete() on a mapped instance.
            deleted = session.query(QueryTaskStack).filter_by(uid=uid).delete()
            if deleted:
                session.commit()
            else:
                print('uid不存在。')
        except Exception as err:
            session.rollback()
            print(err)
            traceback.print_exc()
        finally:
            session.close()

    def run(self):
        """Crawl every uid currently stored in ``QueryTaskStack``."""
        session = build_session(db_url=db_url)
        try:
            # Read the uids before the session is released.
            uid_list = [str(task.uid) for task in session.query(QueryTaskStack).all()]
        finally:
            session.close()
        self.loop_crawl(uid_list=uid_list)
# Script entry point: crawl every queued uid once.
if __name__ == '__main__':
    WeiboFans().run()