Skip to content

Commit

Permalink
Increase number of results to 100 fossasia#59 (fossasia#62)
Browse files Browse the repository at this point in the history
Updated google_search and overloaded the get_google_page function by
introducing a start index parameter. The start index is sent to the Google URL
as an HTTP request parameter. (This involves 10 HTTP calls to Google to fetch
100 results.)
  • Loading branch information
fazeem84 authored and mariobehling committed Jun 15, 2017
1 parent 471bbb0 commit d01c179
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions app/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ def get_google_page(query):
payload = {'q': query}
response = requests.get('https://www.google.com/search', headers=header, params=payload)
return response

def get_google_page(query, startIndex=0):
    """Fetch one page of Google search results for ``query``.

    Python has no function overloading: this definition replaces any
    earlier ``get_google_page`` defined in the same module. ``startIndex``
    therefore defaults to 0 so that callers written for the older
    one-argument form ``get_google_page(query)`` keep working.

    Args:
        query: Search terms, sent as the ``q`` request parameter.
        startIndex: Value sent as the ``start`` request parameter.
            Presumably the zero-based offset of the first result to
            return -- confirm against Google's behaviour.

    Returns:
        The response object returned by ``requests.get``.
    """
    # The request is sent with a desktop-browser User-Agent header.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
    payload = {'q': query, 'start': startIndex}
    return requests.get('https://www.google.com/search', headers=header, params=payload)


def google_search(query):
Expand All @@ -88,12 +98,13 @@ def google_search(query):
[[Tile1,url1], [Title2, url2],..]
"""
urls = []
response = get_google_page(query)
soup = BeautifulSoup(response.text, 'html.parser')
for h3 in soup.findAll('h3', {'class': 'r'}):
links = h3.find('a')
urls.append({'title': links.getText(),
'link': links.get('href')})
for count in range(0,10):
response = get_google_page(query,count*10)
soup = BeautifulSoup(response.text, 'html.parser')
for h3 in soup.findAll('h3', {'class': 'r'}):
links = h3.find('a')
urls.append({'title': links.getText(),
'link': links.get('href')})

return urls

Expand Down Expand Up @@ -154,4 +165,5 @@ def feedgen(query, engine):
urls = bing_search(query)
result = urls
print(result)
return result
print(len(result))
return result

0 comments on commit d01c179

Please sign in to comment.