Skip to content

Commit

Permalink
Modify browser and search to allow setting the socket timeout
Browse files: browse the repository at this point in the history
  • Loading branch information
Thomas Ryckeboer committed Oct 10, 2019
1 parent 9889bd8 commit 07e5ace
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 22 deletions.
17 changes: 11 additions & 6 deletions xgoogle/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# Code is licensed under MIT license.
#

import sys
import ssl
import random
import socket
Expand All @@ -18,6 +19,7 @@
import http.cookiejar
import http.cookies


BROWSERS = (
# Top most popular browsers in my access.log on 2009.02.12
# tail -50000 access.log |
Expand All @@ -42,7 +44,7 @@
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
)

TIMEOUT = 5 # socket timeout
TIMEOUT_SOCKET = 5 # socket timeout

class BrowserError(Exception):
def __init__(self, url, error):
Expand All @@ -52,6 +54,7 @@ def __init__(self, url, error):
class PoolHTTPConnection(http.client.HTTPConnection):
def connect(self):
"""Connect to the host and port specified in __init__."""
global TIMEOUT_SOCKET
msg = "getaddrinfo returns an empty list"
for res in socket.getaddrinfo(self.host, self.port, 0,
socket.SOCK_STREAM):
Expand All @@ -60,7 +63,7 @@ def connect(self):
self.sock = socket.socket(af, socktype, proto)
if self.debuglevel > 0:
print("connect: (%s, %s)" % (self.host, self.port))
self.sock.settimeout(TIMEOUT)
self.sock.settimeout(TIMEOUT_SOCKET)
self.sock.connect(sa)
except socket.error as msg:
if self.debuglevel > 0:
Expand All @@ -80,7 +83,9 @@ def http_open(self, req):
class Browser(object):
"""Provide a simulated browser object.
"""
def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
def __init__(self, timeout, user_agent=BROWSERS[0], debug=False, use_pool=False):
global TIMEOUT_SOCKET
TIMEOUT_SOCKET = timeout
self.headers = {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
Expand Down Expand Up @@ -112,15 +117,15 @@ def get_page(self, url, data=None):
try:
response = self.opener.open(request)
return response.read()
except (urllib.error.HTTPError) as e:
except urllib.error.HTTPError as e:
# Check if we've reached the captcha
if e.code == 503:
print("Error: Captcha page has been reached, exiting...")
sys.exit(1)
raise BrowserError(url, str(e))
except (urllib.error.URLError) as e:
except urllib.error.URLError as e:
raise BrowserError(url, str(e))
except (socket.error, socket.sslerror) as msg:
except (socket.error, ssl.SSLError) as msg:
raise BrowserError(url, msg)
except socket.timeout as e:
raise BrowserError(url, "timeout")
Expand Down
32 changes: 16 additions & 16 deletions xgoogle/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ class GoogleSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None, repeat=None):
def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None, repeat=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -358,14 +358,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -378,10 +378,10 @@ class GoogleVideoSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -626,14 +626,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -646,10 +646,10 @@ class GoogleImageSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -880,14 +880,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand All @@ -901,10 +901,10 @@ class GoogleFaceImageSearch(object):
SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d"
NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5):
self.query = query
self.debug = debug
self.browser = Browser(debug=debug)
self.browser = Browser(debug=debug, timeout=timeout)
self.results_info = None
self.eor = False # end of results
self._page = 0
Expand Down Expand Up @@ -1149,14 +1149,14 @@ def _html_unescape(self, str):
def entity_replacer(m):
entity = m.group(1)
if entity in name2codepoint:
return unichr(name2codepoint[entity])
return chr(name2codepoint[entity])
else:
return m.group(0)

def ascii_replacer(m):
cp = int(m.group(1))
if cp <= 255:
return unichr(cp)
return chr(cp)
else:
return m.group(0)

Expand Down

0 comments on commit 07e5ace

Please sign in to comment.