forked from jhao104/proxy_pool
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscheduler.py
71 lines (55 loc) · 2.05 KB
/
scheduler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: proxyScheduler
Description :
Author : JHao
date: 2019/8/5
-------------------------------------------------
Change Activity:
2019/08/05: proxyScheduler
2021/02/23: runProxyCheck now triggers a fetch when fewer than POOL_SIZE_MIN proxies remain
-------------------------------------------------
"""
__author__ = 'JHao'
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.executors.pool import ProcessPoolExecutor
from util.six import Queue
from helper.fetch import Fetcher
from helper.check import Checker
from handler.logHandler import LogHandler
from handler.proxyHandler import ProxyHandler
from handler.configHandler import ConfigHandler
def __runProxyFetch():
    """Queue every proxy produced by the fetcher and hand the queue to Checker.

    All items yielded by ``Fetcher().run()`` are put on a fresh queue, which
    is then passed to ``Checker`` together with the ``"raw"`` tag.
    NOTE(review): presumably ``"raw"`` marks freshly fetched, not yet
    validated proxies -- confirm against helper.check.
    """
    pending = Queue()
    candidates = Fetcher().run()

    for candidate in candidates:
        pending.put(candidate)

    Checker("raw", pending)
def __runProxyCheck():
    """Queue every stored proxy and hand the queue to Checker.

    Before queuing, the ``"total"`` count reported by the handler's database
    is compared with the configured ``poolSizeMin``; when the count is lower,
    a fetch run is executed first. Every proxy returned by
    ``ProxyHandler.getAll()`` is then put on a fresh queue, which is passed
    to ``Checker`` together with the ``"use"`` tag.
    """
    handler = ProxyHandler()
    pending = Queue()

    stored_total = handler.db.getCount().get("total", 0)
    minimum_size = handler.conf.poolSizeMin
    if stored_total < minimum_size:
        # Pool is running low: top it up before checking.
        __runProxyFetch()

    for stored_proxy in handler.getAll():
        pending.put(stored_proxy)

    Checker("use", pending)
def runScheduler():
    """Run one fetch immediately, then start the periodic scheduler.

    Two interval jobs are registered on a ``BlockingScheduler``:
    ``__runProxyFetch`` every 4 minutes and ``__runProxyCheck`` every
    2 minutes. The final ``scheduler.start()`` call is made on a blocking
    scheduler, so this function is not expected to return while the
    scheduler is running.
    """
    # Populate the pool once up front instead of waiting for the first tick.
    __runProxyFetch()

    timezone = ConfigHandler().timezone
    scheduler_log = LogHandler("scheduler")

    executors = {
        'default': {'type': 'threadpool', 'max_workers': 20},
        'processpool': ProcessPoolExecutor(max_workers=5),
    }
    job_defaults = {
        'coalesce': False,
        'max_instances': 10,
    }

    # Configure the scheduler exactly once. A later configure() call that
    # omits `logger` makes APScheduler fall back to its default logger, which
    # would silently discard the "scheduler" log handler created above.
    scheduler = BlockingScheduler(
        executors=executors,
        job_defaults=job_defaults,
        logger=scheduler_log,
        timezone=timezone,
    )

    scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
    scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")

    scheduler.start()
# Start the scheduler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    runScheduler()