Skip to content

Commit

Permalink
create feature:proxy type check https/http@jiannanya
Browse files Browse the repository at this point in the history
  • Loading branch information
jiannanya committed Apr 29, 2021
1 parent 166cb21 commit 8330a4c
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 12 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,12 @@ docker run --env DB_CONN=redis://:password@ip:port/db -p 5010:5010 jhao104/proxy
| api | method | Description | arg|
| ----| ---- | ---- | ----|
| / | GET | api介绍 | None |
| /get | GET | 随机获取一个代理 | None|
| /get_all | GET | 获取所有代理 |None|
| /get | GET | 随机获取一个代理(http/https) | None|
| /get_all | GET | 获取所有代理(http/https) |None|
| /get_status | GET | 查看代理数量 |None|
| /delete | GET | 删除代理 |proxy=host:port|
| /get_https | GET | 随机获取一个https代理 | None |
| /get_https_all | GET | 获取所有https代理 | None |

* 爬虫使用

Expand Down
22 changes: 20 additions & 2 deletions api/proxyApi.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def force_type(cls, response, environ=None):
# 'refresh': u'refresh proxy pool',
'get_all': u'get all proxy from proxy pool',
'delete?proxy=127.0.0.1:8080': u'delete an unable proxy',
'get_status': u'proxy number'
'get_status': u'proxy number',
'get_https': u'get an useful https proxy',
'get_https_all': u'get all usefull https proxies'
}


Expand Down Expand Up @@ -91,10 +93,26 @@ def getStatus():
status = proxy_handler.getCount()
return status

@app.route('/get_https/')
def getHttps():
    """Return one randomly chosen https proxy from the pool.

    Returns:
        The ``to_dict`` form of a proxy whose ``type`` is "https", or
        ``{"code": 0, "src": "no proxy"}`` when the pool holds no such proxy.
    """
    # Function-scope import so this block does not depend on the file's
    # import header.
    import random

    # Filter once instead of re-drawing random proxies until an https one
    # shows up: the retry loop never terminated when the pool had no https
    # proxy, and dereferenced ``.type`` on the empty-pool result before the
    # "no proxy" fallback could be reached.
    https_proxies = [p for p in proxy_handler.getAll() if p.type == "https"]
    if not https_proxies:
        return {"code": 0, "src": "no proxy"}
    return random.choice(https_proxies).to_dict

@app.route('/get_https_all/')
def getAllHttps():
    """Return every pooled proxy whose type is "https" as a JSON array."""
    https_only = [
        candidate
        for candidate in proxy_handler.getAll()
        if candidate.type == "https"
    ]
    return jsonify([candidate.to_dict for candidate in https_only])

def runFlask():
if platform.system() == "Windows":
app.run(host=conf.serverHost, port=conf.serverPort)
app.run(host=conf.serverHost, port=conf.serverPort,debug=conf.debugMode)
else:
import gunicorn.app.base

Expand Down
4 changes: 4 additions & 0 deletions handler/configHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,7 @@ def poolSizeMin(self):
@LazyProperty
def timezone(self):
    """Return the configured timezone.

    The TIMEZONE environment variable wins; otherwise the TIMEZONE value of
    the setting module is used, or None when neither is defined.
    """
    fallback = getattr(setting, 'TIMEZONE', None)
    return os.getenv("TIMEZONE", fallback)

@LazyProperty
def debugMode(self):
    """Return the DEBUGMODE flag from the setting module.

    Falls back to False when the setting module does not define DEBUGMODE,
    so a settings file written before this option existed keeps working
    (same getattr-with-default approach as ``timezone``).
    """
    return getattr(setting, 'DEBUGMODE', False)
106 changes: 106 additions & 0 deletions helper/ishttps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: isHttps
Description :
Author : nansirc(https://github.com/jiannanya)
date: 2021/4/26
-------------------------------------------------
Change Activity:
2021/4/2:
-------------------------------------------------
"""
__author__ = 'jiannanya'

from handler.logHandler import LogHandler
from handler.proxyHandler import ProxyHandler
from helper.proxy import Proxy
import requests
from requests.exceptions import RequestException
import threading

class HttpsCheckerThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps its return value.

    Args:
        func: Callable executed when the thread runs.
        *args: Positional arguments forwarded to ``func``.
    """

    def __init__(self, func, *args):
        super().__init__()
        self.func = func
        self.args = args
        # Placeholder reported by get_result() until run() stores a value.
        self.result = 0

    def run(self):
        """Call the wrapped function and store whatever it returns."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result.

        This is 0 until ``run`` has finished storing a value (for example
        before the thread was started, or if ``func`` raised).
        """
        # ``self.result`` is always assigned in __init__, so the former
        # try/except around this attribute read could never trigger.
        return self.result

class HttpsChecker(object):
    """Label every pooled proxy as "https" or "http" by probing through it.

    All members are static; the ProxyHandler and LogHandler instances are
    created once when the class body executes.
    """

    # Browser-like request headers sent with every probe request.
    __headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'cache-control': 'max-age=0',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }

    __proxy_handler = ProxyHandler()
    __log = LogHandler("ishttps-checker")

    @staticmethod
    def __put_proxy(proxy):
        """Store ``proxy`` through the shared ProxyHandler."""
        HttpsChecker.__proxy_handler.put(proxy)

    @staticmethod
    def __delete_proxy(proxy):
        """Remove ``proxy`` through the shared ProxyHandler."""
        HttpsChecker.__proxy_handler.delete(proxy)

    @staticmethod
    def __proxy_check(proxy):
        """Probe an https URL through ``proxy`` and record the outcome.

        A 200 response marks the proxy "https"; a RequestException marks it
        "http". In both cases the proxy is deleted and stored again so the
        new type is persisted. Any other status code leaves the proxy
        untouched.
        """
        # NOTE(review): the proxy URL uses the https:// scheme; confirm this
        # is the intended way to address the proxies in the pool.
        route = {"https": "https://{}".format(proxy.proxy)}
        try:
            reply = requests.get("https://www.qq.com/", headers=HttpsChecker.__headers, proxies=route, timeout=3)
            if reply.status_code != 200:
                return
            proxy.type = "https"
            HttpsChecker.__delete_proxy(proxy)
            HttpsChecker.__put_proxy(proxy)
            HttpsChecker.__log.info("Https Check - {} is https".format(proxy.proxy))
        except RequestException:
            proxy.type = "http"
            HttpsChecker.__delete_proxy(proxy)
            HttpsChecker.__put_proxy(proxy)
            HttpsChecker.__log.info("Https Check - {} is http".format(proxy.proxy))

    @staticmethod
    def __proxy_get_all():
        """Return every proxy known to the shared ProxyHandler."""
        return HttpsChecker.__proxy_handler.getAll()

    @staticmethod
    def https_check():
        """Check all pooled proxies, one thread per proxy, and wait for them."""
        candidates = HttpsChecker.__proxy_get_all()
        HttpsChecker.__log.info("Https Check Start!")
        workers = []
        for candidate in candidates:
            worker = HttpsCheckerThread(HttpsChecker.__proxy_check, candidate)
            workers.append(worker)
            worker.start()
        # Block until every probe has finished before reporting completion.
        for worker in workers:
            worker.join()
        HttpsChecker.__log.info("Https Check Done!")

def _runHttpsChecker():
    """Run one full https/http classification pass over the proxy pool."""
    run_pass = HttpsChecker.https_check
    run_pass()

if __name__ == "__main__":
    # Manual run: perform a single https check pass over the current pool.
    HttpsChecker.https_check()
4 changes: 3 additions & 1 deletion helper/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from helper.proxy import Proxy
from helper.fetch import runFetcher
from helper.check import runChecker
from helper.ishttps import _runHttpsChecker
from handler.logHandler import LogHandler
from handler.proxyHandler import ProxyHandler
from handler.configHandler import ConfigHandler
Expand Down Expand Up @@ -47,14 +48,15 @@ def _runProxyCheck():

def runScheduler():
_runProxyFetch()
_runHttpsChecker()

timezone = ConfigHandler().timezone
scheduler_log = LogHandler("scheduler")
scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone)

scheduler.add_job(_runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
scheduler.add_job(_runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")

scheduler.add_job(_runHttpsChecker,'interval',minutes=2, id="https_check", name="https检查")
executors = {
'default': {'type': 'threadpool', 'max_workers': 20},
'processpool': ProcessPoolExecutor(max_workers=5)
Expand Down
3 changes: 2 additions & 1 deletion setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# example:
# Redis: redis://:password@ip:port/db
# Ssdb: ssdb://:password@ip:port
DB_CONN = 'redis://:pwd@127.0.0.1:6379/0'
DB_CONN = 'redis://@127.0.0.1:6379/0'

# proxy table name
TABLE_NAME = 'use_proxy'
Expand Down Expand Up @@ -86,3 +86,4 @@
# Otherwise it will detect the timezone from the system automatically.

# TIMEZONE = "Asia/Shanghai"
# Exposed as ConfigHandler.debugMode and passed as the ``debug`` argument of
# Flask's app.run() in the Windows branch of api/proxyApi.py runFlask().
DEBUGMODE = False
16 changes: 10 additions & 6 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@
from test import testConfigHandler
from test import testLogHandler
from test import testDbClient
from test import testIsHttps

if __name__ == '__main__':
    # Run each smoke test in turn, printing a label before it starts.
    print("ConfigHandler:")
    testConfigHandler.testConfig()

    print("LogHandler:")
    testLogHandler.testLogHandler()

    print("DbClient:")
    testDbClient.testDbClient()

    print("IsHttps:")
    # test/testIsHttps.py defines testIsHttps(); it has no runHttpsChecker
    # attribute, so the previous call raised AttributeError.
    testIsHttps.testIsHttps()
23 changes: 23 additions & 0 deletions test/testIsHttps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: testIsHttps
Description :
Author : nansirc(https://github.com/jiannanya)
date: 2021/2/06
-------------------------------------------------
Change Activity:
2020/2/06:
-------------------------------------------------
"""
__author__ = 'jiannanya'

from helper.ishttps import _runHttpsChecker

def testIsHttps():
    """Smoke test: run one https check pass via helper.ishttps."""
    run_check = _runHttpsChecker
    run_check()



if __name__ == "__main__":
    # Allow running this test module directly.
    testIsHttps()

0 comments on commit 8330a4c

Please sign in to comment.