forked from imclab/sbi.py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sbi.py
169 lines (117 loc) · 4.47 KB
/
sbi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# coding: utf-8
import random
import re
# Python 3 compatibility
try:
import urlparse
except ImportError:
import urllib.parse as urlparse
from bs4 import BeautifulSoup
import requests
# Version string of this module.
__version__ = '0.0.7'
# Public API: the names exported by a star-import of this module.
__all__ = ['search_by', 'SBIResult', 'OhShitCAPTCHA']
class OhShitCAPTCHA(Exception):
    """Signal that a fetched page is a CAPTCHA challenge, not real results.

    Raised by ``cook_soup`` when the parsed page contains an ``<input>``
    element named ``captcha``.
    """
class SBIResult(object):
    """Container for the outcome of one search-by-image lookup.

    An instance is truthy when it holds at least one image, and ``len()``
    reports how many images it holds.
    """

    def __init__(self):
        # URL of the search result page that was requested.
        self.result_page = None
        # URL of the "All sizes" page, or None when no such link was found.
        self.all_sizes_page = None
        # Best-guess label extracted from the result page.
        self.best_guess = None
        # One dict per image found, with 'url', 'width' and 'height' keys.
        self.images = []

    def __bool__(self):
        """Return True when at least one image was collected."""
        return bool(self.images)

    # Python 2 looks up ``__nonzero__`` instead of ``__bool__``.
    __nonzero__ = __bool__

    def __len__(self):
        """Return the number of collected images."""
        return len(self.images)

    def __repr__(self):
        # Wrap the value in a one-element tuple: with a bare value, a
        # tuple-valued best_guess would be unpacked as several format
        # arguments and raise TypeError.
        return '<SBIResult [best_guess: %s]>' % (self.best_guess,)

    def to_dict(self):
        """Return the attributes as a new dict.

        The dict is a shallow copy, so adding, removing or rebinding keys
        on it does not alter this object. The ``images`` list itself is
        still shared with the instance.
        """
        return dict(self.__dict__)
# Pool of browser User-Agent strings; fire_request() sends one chosen at
# random with each request.
# from: http://techblog.willshouse.com/2012/01/03/most-common-user-agents/
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
]
# Base URL against which search_by() resolves the "All sizes" link.
GOOGLE_BASE_URL = 'http://www.google.com/'
# Search-by-image endpoint; search_by() appends the image URL to it.
GOOGLE_SEARCH_BY_ENDPOINT = 'http://images.google.com/searchbyimage?hl=en&image_url='
def fire_request(url, referer, timeout=30):
    """GET *url* with browser-like headers and return the response body.

    Args:
        url: Address to fetch.
        referer: Value sent in the ``Referer`` header.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without one, requests waits indefinitely on a stalled server.
            Pass ``None`` to restore the unbounded wait.

    Returns:
        The raw response body (``Response.content``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.

    The HTTP status code is not checked; the body is returned whatever the
    status, so callers can still inspect error pages.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4',
        'Cache-Control': 'no-cache',
        'Connection': 'close',
        'DNT': '1',
        'Pragma': 'no-cache',
        'Referer': referer,
        # A different User-Agent is picked from the pool on every call.
        'User-Agent': random.choice(USER_AGENTS),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
def cook_soup(text):
    """Parse *text* into a BeautifulSoup tree, rejecting CAPTCHA pages.

    Args:
        text: HTML markup to parse.

    Returns:
        The parsed ``BeautifulSoup`` object.

    Raises:
        OhShitCAPTCHA: If the page contains an ``<input name="captcha">``
            element.
    """
    # Name the parser explicitly. With no parser argument BeautifulSoup
    # picks whichever one is installed (and warns), so the same markup can
    # yield different trees on different machines. 'html.parser' ships with
    # Python and adds no dependency.
    soup = BeautifulSoup(text, 'html.parser')
    # Only existence matters, so stop at the first matching input.
    if soup.find('input', {'name': 'captcha'}) is not None:
        raise OhShitCAPTCHA
    return soup
def extract_best_guess(html):
match = re.search(b'Best guess for this image.*?>(.*?)</a>', html, re.IGNORECASE | re.MULTILINE)
if match:
text = match.group(1)
text = text.title()
else:
text = ''
return text
def search_by(url=None, file=None):
    """Run a Google search-by-image lookup for the image at *url*.

    Args:
        url: Address of the image to look up. Required.
        file: Currently ignored. TODO: support file

    Returns:
        An ``SBIResult``. ``result_page`` and ``best_guess`` are always
        filled in. ``all_sizes_page`` and ``images`` are filled in only
        when the result page has an "All sizes" link; otherwise they keep
        their defaults (``None`` and ``[]``).

    Raises:
        TypeError: If *url* is None.
        OhShitCAPTCHA: If either fetched page is a CAPTCHA challenge
            (raised by ``cook_soup``).
        KeyError, ValueError: If an image link on the "All sizes" page
            lacks the ``imgurl``, ``w`` or ``h`` query parameters, or the
            dimensions are not integers.
    """
    # ``quote`` lives in different modules on Python 2 and Python 3.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    if url is None:
        # Fail with a clear message rather than an opaque
        # "cannot concatenate str and NoneType" further down.
        raise TypeError(
            "search_by() requires 'url'; searching by file is not supported yet"
        )

    # --- Search result page ---
    # Percent-encode the image URL so that '&', '#', '?' and similar
    # characters in it stay part of the image_url parameter value instead
    # of being read as part of the Google query string.
    result_url = GOOGLE_SEARCH_BY_ENDPOINT + quote(url, safe='')
    referer = 'http://www.google.com/imghp'
    result_html = fire_request(result_url, referer)

    result = SBIResult()
    result.result_page = result_url
    result.best_guess = extract_best_guess(result_html)

    soup = cook_soup(result_html)
    all_sizes_a_tag = soup.find('a', text='All sizes')
    # No other sizes of this image found
    if not all_sizes_a_tag:
        return result

    # The link may be relative, so resolve it against the Google base URL.
    all_sizes_url = urlparse.urljoin(GOOGLE_BASE_URL, all_sizes_a_tag['href'])
    result.all_sizes_page = all_sizes_url

    # --- All sizes page ---
    all_sizes_html = fire_request(all_sizes_url, referer=all_sizes_url)
    soup = cook_soup(all_sizes_html)

    images = []
    for link in soup.find_all('a', {'class': 'rg_l'}):
        # Each result link carries the image address and its dimensions in
        # its query string.
        params = urlparse.parse_qs(urlparse.urlparse(link['href']).query)
        images.append({
            'url': params['imgurl'][0],
            'width': int(params['w'][0]),
            'height': int(params['h'][0]),
        })
    result.images = images
    return result