Merge branch 'master' of kenorb/xgoogle
kenorb committed Aug 4, 2015
2 parents 5f413ee + 1e50a34 commit 85d1cab
Showing 2 changed files with 55 additions and 54 deletions.
43 changes: 23 additions & 20 deletions xgoogle/browser.py
@@ -12,9 +12,10 @@
 import random
 import socket
 import urllib
-import urllib2
-import httplib
-import cookielib
+import urllib.request
+import http.client
+import http.cookiejar
+import http.cookies

 BROWSERS = (
     # Top most popular browsers in my access.log on 2009.02.12
@@ -35,6 +36,7 @@
     'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
     'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5',
     'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36',
     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
 )
@@ -46,7 +48,7 @@ def __init__(self, url, error):
         self.url = url
         self.error = error

-class PoolHTTPConnection(httplib.HTTPConnection):
+class PoolHTTPConnection(http.client.HTTPConnection):
     def connect(self):
         """Connect to the host and port specified in __init__."""
         msg = "getaddrinfo returns an empty list"
@@ -56,21 +58,21 @@ def connect(self):
             try:
                 self.sock = socket.socket(af, socktype, proto)
                 if self.debuglevel > 0:
-                    print "connect: (%s, %s)" % (self.host, self.port)
+                    print("connect: (%s, %s)" % (self.host, self.port))
                 self.sock.settimeout(TIMEOUT)
                 self.sock.connect(sa)
-            except socket.error, msg:
+            except socket.error as msg:
                 if self.debuglevel > 0:
-                    print 'connect fail:', (self.host, self.port)
+                    print('connect fail:', (self.host, self.port))
                 if self.sock:
                     self.sock.close()
                 self.sock = None
                 continue
             break
         if not self.sock:
-            raise socket.error, msg
+            raise socket.error(msg)

-class PoolHTTPHandler(urllib2.HTTPHandler):
+class PoolHTTPHandler(urllib.request.HTTPHandler):
     def http_open(self, req):
         return self.do_open(PoolHTTPConnection, req)

@@ -81,36 +83,37 @@ def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
         self.headers = {
             'User-Agent': user_agent,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-us,en;q=0.5'
+            'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
+            # 'Accept-Encoding': 'deflate'
         }
         self.debug = debug
-        self._cj = cookielib.CookieJar()
+        self._cj = http.cookiejar.CookieJar()

         self.handlers = [PoolHTTPHandler]
-        self.handlers.append(urllib2.HTTPCookieProcessor(self._cj))
+        self.handlers.append(urllib.request.HTTPCookieProcessor(self._cj))

-        self.opener = urllib2.build_opener(*self.handlers)
+        self.opener = urllib.request.build_opener(*self.handlers)
         self.opener.addheaders = []

         try:
             conn = self.opener.open("http://www.google.com/ncr")
             conn.info() # retrieve session cookie
-        except Exception, e:
-            print e
+        except Exception as e:
+            print(e)

     def get_page(self, url, data=None):
         # handlers = [PoolHTTPHandler]
-        # opener = urllib2.build_opener(*handlers)
+        # opener = urllib.request.build_opener(*handlers)
         if data: data = urllib.urlencode(data)
-        request = urllib2.Request(url, data, self.headers)
+        request = urllib.request.Request(url, data, self.headers)
         try:
             response = self.opener.open(request)
             return response.read()
-        except (urllib2.HTTPError, urllib2.URLError), e:
+        except (urllib.error.HTTPError, urllib.error.URLError) as e:
             raise BrowserError(url, str(e))
-        except (socket.error, socket.sslerror), msg:
+        except (socket.error, socket.sslerror) as msg:
             raise BrowserError(url, msg)
-        except socket.timeout, e:
+        except socket.timeout as e:
             raise BrowserError(url, "timeout")
         except KeyboardInterrupt:
             raise
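Below is a minimal Python 3 sketch, for reference only, of the fetch path this file migrates to; it is not the library code itself (PoolHTTPConnection/PoolHTTPHandler are omitted, and RuntimeError stands in for BrowserError). Two context lines the commit leaves untouched would still fail on Python 3: urllib.urlencode now lives at urllib.parse.urlencode, and socket.sslerror no longer exists (ssl.SSLError is the nearest equivalent).

import http.cookiejar
import urllib.error
import urllib.parse
import urllib.request

# Cookie-aware opener, equivalent to the handler chain Browser.__init__ builds.
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36',
    'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
}

def get_page(url, data=None):
    # POST bodies must be bytes on Python 3, and urllib.urlencode became urllib.parse.urlencode.
    body = urllib.parse.urlencode(data).encode('utf-8') if data else None
    request = urllib.request.Request(url, body, headers)
    try:
        return opener.open(request).read()
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        raise RuntimeError("Failed getting %s: %s" % (url, e))  # stand-in for BrowserError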
66 changes: 32 additions & 34 deletions xgoogle/search.py
@@ -1,10 +1,8 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 # -*- coding: utf8 -*-
 #
-# Peteris Krumins ([email protected])
-# http://www.catonmat.net -- good coders code, great reuse
-# Updated by Nikola Milosevic ([email protected]
-# http://www.inspiratron.org
+# Peteris Krumins ([email protected]) [http://www.catonmat.net]
+# Updated by Nikola Milosevic ([email protected]) [http://www.inspiratron.org]
 #
 # http://www.catonmat.net/blog/python-library-for-google-search/
 #
@@ -13,8 +11,8 @@

 import re
 import urllib
-from htmlentitydefs import name2codepoint
-from BeautifulSoup import BeautifulSoup
+import html.entities
+from bs4 import BeautifulSoup
 import nltk

 from browser import Browser, BrowserError
@@ -171,7 +169,7 @@ def _set_first_indexed_in_previous(self, interval):
         try:
             num = float(interval)
         except ValueError:
-            raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
         self._first_indexed_in_previous = 'm' + str(interval)

     first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
@@ -237,7 +235,7 @@ def _get_results_page(self):
         else:
             url = GoogleSearch.NEXT_PAGE_1

-        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+        safe_url = [url % { 'query': urllib.parse.quote_plus(self.query),
                             'start': self._page * self._results_per_page,
                             'num': self._results_per_page,
                             'tld' : self._tld,
@@ -254,10 +252,10 @@ def _get_results_page(self):

         try:
             page = self.browser.get_page(safe_url)
-        except BrowserError, e:
-            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+        except BrowserError as e:
+            raise SearchError("Failed getting %s: %s" % (e.url, e.error))

-        return BeautifulSoup(page)
+        return BeautifulSoup(page, "html.parser")

     def _extract_info(self, soup):
         """Extract total results
@@ -309,15 +307,15 @@ def _extract_title_url(self, result):
         url = title_a['href']
         match = re.match(r'/url\?q=((http|ftp|https)[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         match = re.match(r'/interstitial\?url=((http|ftp|https)[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         return title, url

     def _extract_description(self, result):
         """Seems this is enough"""
-        desc = result.find('div', {'class': 's'}).find('span', {'class': 'st'})
+        desc = result.find('div', {'class': 'st'})
         return desc

         desc_div = result.find('span', 'st')
@@ -439,7 +437,7 @@ def _set_first_indexed_in_previous(self, interval):
         try:
             num = float(interval)
         except ValueError:
-            raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
         self._first_indexed_in_previous = 'm' + str(interval)

     first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
@@ -503,7 +501,7 @@ def _get_results_page(self):
         else:
             url = GoogleVideoSearch.NEXT_PAGE_1

-        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+        safe_url = [url % { 'query': urllib.parse.quote_plus(self.query),
                             'start': self._page * self._results_per_page,
                             'num': self._results_per_page,
                             'tld' : self._tld,
@@ -520,8 +518,8 @@ def _get_results_page(self):

         try:
             page = self.browser.get_page(safe_url)
-        except BrowserError, e:
-            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+        except BrowserError as e:
+            raise SearchError("Failed getting %s: %s" % (e.url, e.error))

         return BeautifulSoup(page)

@@ -588,7 +586,7 @@ def _extract_title_url(self, result):
         url = title_a['href']
         match = re.match(r'/url\?q=(http[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         return title, url

     def _extract_description(self, result):
@@ -707,7 +705,7 @@ def _set_first_indexed_in_previous(self, interval):
         try:
             num = float(interval)
         except ValueError:
-            raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
         self._first_indexed_in_previous = 'm' + str(interval)

     first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
@@ -771,7 +769,7 @@ def _get_results_page(self):
         else:
             url = GoogleImageSearch.NEXT_PAGE_1

-        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+        safe_url = [url % { 'query': urllib.parse.quote_plus(self.query),
                             'start': self._page * self._results_per_page,
                             'num': self._results_per_page,
                             'tld' : self._tld,
@@ -788,8 +786,8 @@ def _get_results_page(self):

         try:
             page = self.browser.get_page(safe_url)
-        except BrowserError, e:
-            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+        except BrowserError as e:
+            raise SearchError("Failed getting %s: %s" % (e.url, e.error))

         return BeautifulSoup(page)

@@ -842,7 +840,7 @@ def _extract_title_url(self, result):
         url = title_a['href']
         match = re.match(r'/url\?q=(http[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         return title, url

     def _extract_description(self, result):
@@ -964,7 +962,7 @@ def _set_first_indexed_in_previous(self, interval):
         try:
             num = float(interval)
         except ValueError:
-            raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
         self._first_indexed_in_previous = 'm' + str(interval)

     first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
@@ -1028,7 +1026,7 @@ def _get_results_page(self):
         else:
             url = GoogleFaceImageSearch.NEXT_PAGE_1

-        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+        safe_url = [url % { 'query': urllib.parse.quote_plus(self.query),
                             'start': self._page * self._results_per_page,
                             'num': self._results_per_page,
                             'tld' : self._tld,
@@ -1045,8 +1043,8 @@ def _get_results_page(self):

         try:
             page = self.browser.get_page(safe_url)
-        except BrowserError, e:
-            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+        except BrowserError as e:
+            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
         return BeautifulSoup(page)

     def _extract_info(self, soup):
@@ -1068,8 +1066,8 @@ def _extract_info(self, soup):
         matches = re.search(r'(\d+)', txt, re.U)

         if not matches:
-            print self._re_search_strings[0]
-            print txt
+            print(self._re_search_strings[0])
+            print(txt)
             return empty_info
         return {'from': 0, 'to': 0, 'total': int(matches.group(1))}

@@ -1104,10 +1102,10 @@ def _extract_title_url(self, result):
         url = title_a['href']
         match = re.match(r'/url\?q=((http|ftp|https)[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         match = re.match(r'/interstitial\?url=((http|ftp|https)[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         return title, url

     def _extract_description(self, result):
@@ -1208,7 +1206,7 @@ def _extract_title_url(self, result):
         url = title_a['href']
         match = re.match(r'/url\?q=(http[^&]+)&', url)
         if match:
-            url = urllib.unquote(match.group(1))
+            url = urllib.parse.unquote(match.group(1))
         return title, url

     def _extract_description(self, result):
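The search.py changes repeat three mechanical Python 3 conversions: raise SomeError, "msg" becomes raise SomeError("msg"), except Type, name becomes except Type as name, and urllib.quote_plus/urllib.unquote move into urllib.parse. A short illustrative sketch of the migrated calls (not library code; the query, href, and HTML string below are made up):

import re
import urllib.parse
from bs4 import BeautifulSoup

# URL escaping moved into urllib.parse.
safe_query = urllib.parse.quote_plus("foo bar")    # was urllib.quote_plus; yields 'foo+bar'

href = "/url?q=http://example.com/a%20b&sa=U"
match = re.match(r'/url\?q=(http[^&]+)&', href)
if match:
    target = urllib.parse.unquote(match.group(1))  # was urllib.unquote

# Exception syntax: 'except E, e' -> 'except E as e'; 'raise E, "msg"' -> 'raise E("msg")'.
try:
    float("not-a-number")
except ValueError as e:
    print("Wrong parameter: %s" % e)

# bs4 replaces the old BeautifulSoup package; passing the parser explicitly is safest.
soup = BeautifulSoup("<div class='st'>snippet</div>", "html.parser")
desc = soup.find('div', {'class': 'st'})

Two loose ends are visible in the hunks above: only GoogleSearch._get_results_page passes an explicit parser to BeautifulSoup, while the video, image, and face-image variants still call BeautifulSoup(page) and leave bs4 to guess (newer bs4 versions warn about this); and the unchanged from browser import Browser, BrowserError line is a Python 2 implicit relative import, which Python 3 rejects inside a package (it would need from .browser import Browser, BrowserError).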
