Skip to content

Commit 55506af

Browse files
committed
Major Refactoring
1 parent 39f6907 commit 55506af

File tree

1 file changed

+268
-0
lines changed

1 file changed

+268
-0
lines changed

config.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
import json
2+
import time
3+
import random
4+
import operator
5+
import sys
6+
import os
7+
from utils import enum
8+
9+
# Configurations
10+
11+
# Rotation policies: how the proxy to be rotated out is selected.
Policy = enum(
    # Pick any active proxy at random
    'ROTATION_RANDOM',
    # Least recently used
    'ROTATION_LRU',
    # Switch to another region
    'ROTATION_NEW_REGION',
    # LRU + New region
    'ROTATION_LRU_NEW_REGION',
)
19+
20+
# Numeric datacenter (region) code -> human readable location name.
region_dict = {
    2: 'Dallas',
    3: 'Fremont',
    4: 'Atlanta',
    6: 'Newark',
    7: 'London',
    8: 'Tokyo',
    9: 'Singapore',
    10: 'Frankfurt',
}
28+
29+
30+
# Body of the notification mail sent after a proxy switch. Interpolated with
# a mapping that provides the keys: label, proxy_in, proxy_out and region.
email_template = """

I just switched a proxy node in the proxy infrastructure. Details are below.

In: %(label)s, %(proxy_in)s
Out: %(label)s, %(proxy_out)s

Region: %(region)s

-- Linode proxy daemon

"""
42+
43+
# Post process command. The three %s placeholders follow ssh's
# `user@host "command"` form: remote user, remote host, command to run.
# Host key checking is disabled in the command line itself.
post_process_cmd_template = """ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null %s@%s "%s" """
# Shell commands intended to be run through the template above.
iptables_restore_cmd = "sudo iptables-restore < /etc/iptables.rules"
squid_restart_cmd = "sudo squid3 -f /etc/squid3/squid.conf"
47+
48+
49+
class ProxyConfig(object):
    """ Class representing configuration of crawler proxy infrastructure.

    Every key of the JSON configuration file becomes an instance attribute.
    The keys read by this class are: proxylist, lb_template, lb_config,
    lb_restart, frequency, policy and email.

    Runtime state:
      config         - the raw configuration dictionary
      proxy_dict     - proxy IP -> [ip, region, proxy id, switch_in, switch_out]
      proxy_state    - proxy IP -> True if the proxy is enabled
      proxy_template - text of the load balancer config template
    """

    def __init__(self, cfg='proxy.conf'):
        """ Initialize proxy config from the config file.

        Exits the process if the proxy list or the load balancer template
        cannot be read. A malformed proxy list is only reported; the proxies
        parsed before the bad line are kept.
        """

        self.parse_config(cfg)

        # Proxy IP to proxy record mappings
        self.proxy_dict = {}
        # Proxy IP to enabled mapping
        self.proxy_state = {}

        # This is a file with each line of the form
        # IPV4 address, datacenter code, linode-id, switch_in timestamp, switch_out timestamp
        # E.g: 45.79.91.191, 3, 1234567, 1446731065, 1446733900
        try:
            with open(self.proxylist) as handle:
                # Skip blank lines and strip every field so that stray
                # whitespace does not end up in the IP or the proxy id.
                proxies = [[field.strip() for field in line.split(',')]
                           for line in handle if line.strip()]
        except (OSError, IOError) as e:
            print(e)
            sys.exit("Fatal error, proxy list input file " + self.proxylist + " not found!")

        try:
            self.process_proxies(proxies)
        except ValueError as e:
            print(e)
            print(self.proxylist + " is empty or has junk values")

        try:
            with open(self.lb_template) as handle:
                self.proxy_template = handle.read()
        except (OSError, IOError) as e:
            print(e)
            sys.exit("Fatal error, template config input file " + self.lb_template + " not found!")

    def parse_config(self, cfg):
        """ Parse the JSON configuration file `cfg` and load config.

        Each top level key is set as an attribute. Afterwards `frequency`
        is multiplied by 3600 (presumably hours to seconds - confirm with
        the config documentation) and `policy` is replaced by the member of
        `Policy` with that name.
        """

        with open(cfg) as handle:
            self.config = json.load(handle)

        for key, value in self.config.items():
            # Set attribute locally
            setattr(self, key, value)

        # Do some further processing
        self.frequency = float(self.frequency) * 3600.0
        # Plain attribute lookup instead of eval(): the policy name comes
        # from a file and must never be executed as code.
        self.policy = getattr(Policy, self.policy)

    def get_proxy_ips(self):
        """ Return all proxy IP addresses (enabled or not) as a list """

        return list(self.proxy_state)

    def get_active_proxies(self):
        """ Return the records of all enabled proxies as a list """

        return [self.proxy_dict.get(ip)
                for ip, enabled in self.proxy_state.items() if enabled]

    def process_proxies(self, proxies):
        """ Process the proxy information to create internal dictionaries.

        @proxies - Iterable of 5-item sequences
                   (ip, region, proxy id, switch_in, switch_out).

        A timestamp of 0 is replaced with the current time. Raises
        ValueError if a row does not have five fields or has a non-numeric
        region or timestamp.
        """

        for proxy_ip, region, proxy_id, switch_in, switch_out in proxies:
            switch_in = int(float(switch_in))
            switch_out = int(float(switch_out))

            # If a timestamp is 0: put current time
            if switch_in == 0:
                switch_in = int(time.time())
            if switch_out == 0:
                switch_out = int(time.time())

            self.proxy_dict[proxy_ip] = [proxy_ip, int(region), proxy_id, switch_in, switch_out]
            self.proxy_state[proxy_ip] = True

        print('Processed %s proxies.' % len(self.proxy_state))

    def get_proxy_for_rotation(self,
                               use_random=False,
                               least_used=False,
                               region_switch=False,
                               input_region=3):
        """ Return a proxy for rotation using the given settings. The
        returned proxy is marked as switched out and will be replaced with
        a new proxy.

        @use_random - Means returns a random proxy from the current active list
        @least_used - Returns a proxy IP which is the oldest switched out one
                      so we keep the switching more or less democratic.
        @region_switch - Returns a proxy which belongs to a different region
                      from the new proxy.
        @input_region - The region of the new proxy node - defaults to Fremont, CA.

        Note that if use_random is set to true, the other parameters are ignored.

        Returns None if there is no active proxy, or if no proxy matches
        the requested settings.
        """

        active_proxies = self.get_active_proxies()
        print('Active proxies => %s' % (active_proxies,))

        if not active_proxies:
            # Nothing to rotate out
            return None

        if use_random:
            # Pick a random proxy
            proxy = random.choice(active_proxies)
            print('Returning proxy => %s' % (proxy,))

            # Mark it as switched out
            self.switch_out_proxy(proxy[0])
            # NOTE(review): this branch returns the whole proxy record while
            # the other branches return only the IP string. Kept as is for
            # backward compatibility - confirm against the callers.
            return proxy

        if least_used:
            # Order by switched out timestamp, oldest first
            proxies_used = sorted(active_proxies,
                                  key=operator.itemgetter(-1))

            print('Proxies used => %s' % (proxies_used,))

            if region_switch:
                # Find the oldest one with a different region from input
                for proxy, reg, _pid, _sin, _sout in proxies_used:
                    if reg != input_region:
                        print('Returning proxy %s from region %s' % (proxy, reg))
                        self.switch_out_proxy(proxy)
                        return proxy

            # No region constraint, or every active proxy is in the input
            # region: pick the oldest one anyway. It is switched out here
            # as well so that all branches leave the same state behind.
            oldest = proxies_used[0][0]
            self.switch_out_proxy(oldest)
            return oldest

        if region_switch:
            # Pick a random proxy not in the input region
            random.shuffle(active_proxies)

            for proxy, reg, _pid, _sin, _sout in active_proxies:
                if reg != input_region:
                    print('Returning proxy %s from region %s' % (proxy, reg))
                    self.switch_out_proxy(proxy)
                    return proxy

        return None

    def __getattr__(self, name):
        """ Fall back to the raw configuration for unknown attributes.

        Only called when normal attribute lookup fails. Returns the config
        value for `name`, or None if the config has no such key. Raises
        AttributeError for dunder names (so that protocols such as copy and
        pickle work) and when the config has not been loaded yet (which
        would otherwise recurse forever).
        """

        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)

        try:
            config = self.__dict__['config']
        except KeyError:
            raise AttributeError(name)

        return config.get(name)

    def switch_out_proxy(self, proxy):
        """ Switch out a given proxy IP. Raises KeyError for an unknown IP. """

        # Disable it
        self.proxy_state[proxy] = False
        # Mark its switched out timestamp
        self.proxy_dict[proxy][-1] = int(time.time())

    def switch_in_proxy(self, proxy, proxy_id, region):
        """ Switch in a given proxy IP, replacing any existing record for it """

        # Both timestamps start out as the current time
        now = int(time.time())
        self.proxy_dict[proxy] = [proxy, int(region), proxy_id, now, now]
        # Enable it
        self.proxy_state[proxy] = True

    def get_active_regions(self):
        """ Return unique regions for which proxies are active """

        regions = set()
        for proxy, region, _pid, _sin, _sout in self.proxy_dict.values():
            if self.proxy_state[proxy]:
                regions.add(region)

        return list(regions)

    def write(self, disabled=False):
        """ Write current state to the proxy list file.

        @disabled - If true, disabled proxies are written as well.
        """

        lines = []
        for proxy, reg, pi, si, so in self.proxy_dict.values():
            if disabled or self.proxy_state[proxy]:
                lines.append('%s,%s,%s,%s,%s\n' % (proxy, str(reg), str(pi), str(int(si)), str(int(so))))

        with open(self.proxylist, 'w') as handle:
            handle.writelines(lines)

    def write_lb_config(self, disabled=False, test=False):
        """ Write current proxy configuration into the load balancer config.

        @disabled - Unused, kept for interface compatibility.
        @test - If true, the generated file is left in /tmp/.haproxy.cfg
                and is not copied over the real load balancer config.

        Always returns True.
        """

        lines, idx = [], 1
        # Shuffle (a copy, so that it also works on dictionary views)
        items = list(self.proxy_dict.values())
        random.shuffle(items)

        for proxy, reg, pi, si, so in items:
            if self.proxy_state[proxy]:
                lines.append('\tserver squid%d %s:8321 check inter 10000 rise 2 fall 5' % (idx, proxy))
                idx += 1

        squid_config = "\n".join(lines)
        # The template is interpolated against this function's local names,
        # so `squid_config` must keep its name.
        content = self.proxy_template % locals()
        # Write to temp file
        tmpfile = '/tmp/.haproxy.cfg'
        with open(tmpfile, 'w') as handle:
            handle.write(content)

        # If running in test mode, don't do this!
        if not test:
            # Run as sudo
            cmd = 'sudo cp %s %s; rm -f %s' % (tmpfile, self.lb_config, tmpfile)
            os.system(cmd)

        # NOTE(review): the reload is assumed to run in test mode too; the
        # original nesting could not be recovered - confirm.
        self.reload_lb()
        return True

    def reload_lb(self):
        """ Reload the HAProxy load balancer. Returns True if the restart
        command exits with status 0. """

        return (os.system(self.lb_restart) == 0)

    def get_proxy_id(self, proxy):
        """ Given proxy IP return its id """

        return self.proxy_dict[proxy][2]

    def get_email_config(self):
        """ Return email configuration (the 'email' key of the config) """

        return self.config['email']
268+

0 commit comments

Comments
 (0)