Skip to content

Commit

Permalink
[update] 停用Redis type SsdbClient支持redis
Browse files Browse the repository at this point in the history
  • Loading branch information
jinghao_wb committed Sep 22, 2017
1 parent 88e8fab commit 0add262
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 79 deletions.
9 changes: 5 additions & 4 deletions Api/ProxyApi.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@

api_list = {
'get': u'get an usable proxy',
'refresh': u'refresh proxy pool',
# 'refresh': u'refresh proxy pool',
'get_all': u'get all proxy from proxy pool',
'delete?proxy=127.0.0.1:8080': u'delete an unable proxy',
'get_status': u'proxy statistics'
}


Expand All @@ -55,7 +56,7 @@ def refresh():
@app.route('/get_all/')
def getAll():
proxies = ProxyManager().getAll()
return jsonify([proxy.decode('utf8') for proxy in proxies])
return jsonify(proxies)


@app.route('/delete/', methods=['GET'])
Expand All @@ -66,8 +67,8 @@ def delete():


@app.route('/get_status/')
def get_status():
status = ProxyManager().get_status()
def getStatus():
status = ProxyManager().getNumber()
return jsonify(status)


Expand Down
2 changes: 1 addition & 1 deletion Config.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[DB]
;Configure the database information
;type: SSDB/REDIS
;type: SSDB/REDIS if use redis, only modify the host port,the type should be SSDB
type = SSDB
host = localhost
port = 8888
Expand Down
43 changes: 23 additions & 20 deletions DB/DbClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# !/usr/bin/env python
"""
-------------------------------------------------
File Name: DbClient.py
File Name: DbClient.py
Description : DB工厂类
Author : JHao
date: 2016/12/2
Expand Down Expand Up @@ -31,13 +31,16 @@ class DbClient(object):
useful_proxy_queue: 存放检验后的代理;
抽象方法定义:
get: 随机返回一个代理;
put: 放回一个代理;
getvalue: 返回代理属性(一个计数器);
inckey: 修改代理属性计数器的值;
delete: 删除指定代理;
getAll: 返回所有代理;
changeTable: 切换 table or collection or hash;
get(proxy): 返回proxy的信息;
put(proxy): 存入一个代理;
pop(): 弹出一个代理
exists(proxy): 判断代理是否存在
getNumber(raw_proxy): 返回代理总数(一个计数器);
update(proxy, num): 修改代理属性计数器的值;
delete(proxy): 删除指定代理;
getAll(): 返回所有代理;
changeTable(name): 切换 table or collection or hash;
所有方法需要相应类去具体实现:
SSDB:SsdbClient.py
Expand Down Expand Up @@ -72,32 +75,32 @@ def __initDbClient(self):
host=self.config.db_host,
port=self.config.db_port)

def get(self, **kwargs):
return self.client.get(**kwargs)
def get(self, key, **kwargs):
return self.client.get(key, **kwargs)

def put(self, key, **kwargs):
return self.client.put(key, **kwargs)

def getvalue(self, key, **kwargs):
return self.client.getvalue(key, **kwargs)

def pop(self, **kwargs):
return self.client.pop(**kwargs)

def inckey(self, key, value, **kwargs):
return self.client.inckey(key, value, **kwargs)
def update(self, key, value, **kwargs):
return self.client.update(key, value, **kwargs)

def delete(self, key, **kwargs):
return self.client.delete(key, **kwargs)

def exists(self, key, **kwargs):
return self.client.exists(key, **kwargs)

def pop(self, **kwargs):
return self.client.pop(**kwargs)

def getAll(self):
return self.client.getAll()

def changeTable(self, name):
self.client.changeTable(name)

def get_status(self):
return self.client.get_status()
def getNumber(self):
return self.client.getNumber()


if __name__ == "__main__":
Expand Down
10 changes: 8 additions & 2 deletions DB/RedisClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,17 @@
2017/4/17 修改pop
'''

# ############################
# 已弃用,
# SsdbClient.py 支持redis
##############################

import json
import random
import redis
import sys


class RedisClient(object):
"""
Reids client
Expand Down Expand Up @@ -41,7 +47,7 @@ def get(self):
return rkey.decode('utf-8')
else:
return rkey
# return self.__conn.srandmember(name=self.name)
# return self.__conn.srandmember(name=self.name)

def put(self, key):
"""
Expand Down Expand Up @@ -87,7 +93,7 @@ def getAll(self):
return [key.decode('utf-8') for key in self.__conn.hgetall(self.name).keys()]
else:
return self.__conn.hgetall(self.name).keys()
# return self.__conn.smembers(self.name)
# return self.__conn.smembers(self.name)

def get_status(self):
return self.__conn.hlen(self.name)
Expand Down
70 changes: 34 additions & 36 deletions DB/SsdbClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
-------------------------------------------------
Change Activity:
2016/12/2:
2017/04/26: 添加get_status方法获取hash长度
2017/09/22: PY3中 redis-py返回的数据是bytes型
-------------------------------------------------
"""
__author__ = 'JHao'
Expand All @@ -19,16 +19,15 @@
from redis.connection import BlockingConnectionPool
from redis import Redis
import random
import json


class SsdbClient(object):
"""
SSDB client
SSDB中代理存放的容器为hash:
原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性;
验证后供flask使用的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为None,以后扩展可能会加入代理属性
原始代理存放在name为raw_proxy的hash中,key为代理的ip:port,value为为None,以后扩展可能会加入代理属性;
验证后的代理存放在name为useful_proxy的hash中,key为代理的ip:port,value为一个计数,初始为1,每校验失败一次减1
"""

Expand All @@ -43,43 +42,28 @@ def __init__(self, name, host, port):
self.name = name
self.__conn = Redis(connection_pool=BlockingConnectionPool(host=host, port=port))

def get(self):
def get(self, proxy):
"""
get an item
从useful_proxy_queue随机获取一个可用代理, 使用前需要调用changeTable("useful_proxy_queue")
从hash中获取对应的proxy, 使用前需要调用changeTable()
:param proxy:
:return:
"""
values = self.__conn.hkeys(name=self.name)
keys = list(values) if EnvUtil.PY3 else values
return random.choice(keys) if values else None
data = self.__conn.hget(name=self.name, key=proxy)
if data:
return data.decode('utf-8') if EnvUtil.PY3 else data
else:
return None

def put(self, key):
def put(self, proxy, num=1):
"""
put an item
将代理放入hash, 使用changeTable指定hash name
:param key:
:param proxy:
:param num:
:return:
"""
key = json.dump(key, ensure_ascii=False) if isinstance(key, (dict, list)) else key
return self.__conn.hincrby(self.name, key, 1)

def getvalue(self, key):
value = self.__conn.hget(self.name, key)
return value if value else None

def pop(self):
"""
pop an item
弹出一个代理, 使用changeTable指定hash name
:return:
"""
key = self.get()
if key:
self.__conn.hdel(self.name, key)
return key
data = self.__conn.hincrby(self.name, proxy, num)
return data

def delete(self, key):
"""
Expand All @@ -89,14 +73,28 @@ def delete(self, key):
"""
self.__conn.hdel(self.name, key)

def inckey(self, key, value):
def update(self, key, value):
self.__conn.hincrby(self.name, key, value)

def pop(self):
proxies = self.__conn.hkeys(self.name)
if proxies:
proxy = random.choice(proxies)
self.delete(proxy)
return proxy
return None

def exists(self, key):
return self.__conn.hexists(self.name, key)

def getAll(self):
keys = self.__conn.hkeys(self.name)
return list(keys) if EnvUtil.PY3 else keys
item_dict = self.__conn.hgetall(self.name)
if EnvUtil.PY3:
return {key.decode('utf8'): value.decode('utf8') for key, value in item_dict.items()}
else:
return item_dict

def get_status(self):
def getNumber(self):
"""
Return the number of elements in hash ``name``
:return:
Expand Down
31 changes: 22 additions & 9 deletions Manager/ProxyManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
"""
__author__ = 'JHao'

import random

from Util import EnvUtil
from DB.DbClient import DbClient
from Util.GetConfig import GetConfig
from Util.LogHandler import LogHandler
Expand Down Expand Up @@ -45,8 +48,11 @@ def refresh(self):
proxy_set.add(proxy.strip())

# store raw proxy
self.db.changeTable(self.raw_proxy_queue)
for proxy in proxy_set:
self.db.changeTable(self.useful_proxy_queue)
if self.db.exists(proxy):
continue
self.db.changeTable(self.raw_proxy_queue)
self.db.put(proxy)

def get(self):
Expand All @@ -55,7 +61,13 @@ def get(self):
:return:
"""
self.db.changeTable(self.useful_proxy_queue)
return self.db.get()
item_dict = self.db.getAll()
if item_dict:
if EnvUtil.PY3:
return random.choice(list(item_dict.keys()))
else:
return random.choice(item_dict.keys())
return None
# return self.db.pop()

def delete(self, proxy):
Expand All @@ -69,21 +81,22 @@ def delete(self, proxy):

def getAll(self):
"""
get all proxy from pool
get all proxy from pool as list
:return:
"""
self.db.changeTable(self.useful_proxy_queue)
return self.db.getAll()
item_dict = self.db.getAll()
if EnvUtil.PY3:
return list(item_dict.keys()) if item_dict else list()
return item_dict.keys() if item_dict else list()

def get_status(self):
# TODO rename get_count..
def getNumber(self):
self.db.changeTable(self.raw_proxy_queue)
total_raw_proxy = self.db.get_status()
total_raw_proxy = self.db.getNumber()
self.db.changeTable(self.useful_proxy_queue)
total_useful_queue = self.db.get_status()
total_useful_queue = self.db.getNumber()
return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue}

if __name__ == '__main__':
pp = ProxyManager()
pp.refresh()
print(pp.get_status())
4 changes: 2 additions & 2 deletions Schedule/ProxyRefreshSchedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def validProxy(self):
raw_proxy = self.db.pop()
self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
# 计算剩余代理,用来减少重复计算
remaining_proxies = self.db.getAll()
remaining_proxies = self.getAll()
while raw_proxy:
if isinstance(raw_proxy, bytes):
# 兼容Py3
Expand All @@ -63,7 +63,7 @@ def validProxy(self):
self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy)
self.db.changeTable(self.raw_proxy_queue)
raw_proxy = self.db.pop()
remaining_proxies = self.db.getAll()
remaining_proxies = self.getAll()
self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())


Expand Down
6 changes: 3 additions & 3 deletions Schedule/ProxyValidSchedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,19 @@ def __validProxy(self):
# 兼容PY3
each_proxy = each_proxy.decode('utf-8')

value = self.db.getvalue(each_proxy)
value = self.db.get(each_proxy)
if validUsefulProxy(each_proxy):
# 成功计数器加1
if value and int(value) < 1:
self.db.inckey(each_proxy, 1)
self.db.update(each_proxy, 1)
self.log.info('ProxyValidSchedule: {} validation pass'.format(each_proxy))
else:
# 失败计数器减一
if value and int(value) < -5:
# 计数器小于-5删除该代理
self.db.delete(each_proxy)
else:
self.db.inckey(each_proxy, -1)
self.db.update(each_proxy, -1)
self.log.info('ProxyValidSchedule: {} validation fail'.format(each_proxy))

self.log.info('ProxyValidSchedule running normal')
Expand Down
4 changes: 2 additions & 2 deletions Util/utilFunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ def validUsefulProxy(proxy):
proxy = proxy.decode('utf8')
proxies = {"https": "https://{proxy}".format(proxy=proxy)}
try:
# 超过40秒的代理就不要了
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=40, verify=False)
# 超过20秒的代理就不要了
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=20, verify=False)
if r.status_code == 200:
logger.info('%s is ok' % proxy)
return True
Expand Down

0 comments on commit 0add262

Please sign in to comment.