Skip to content

Commit

Permalink
Merge pull request fossasia#13 from rhemon/bing
Browse files Browse the repository at this point in the history
  • Loading branch information
mariobehling authored Jan 3, 2017
2 parents 914d36f + c113935 commit 55fe18b
Showing 1 changed file with 65 additions and 8 deletions.
73 changes: 65 additions & 8 deletions rss-generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@


search_engines = {'g':("GOOGLE SEARCH RESULTS", "htps://www.google.com", "Google search results for %s"),
'd':("DUCKDUCKGO SEARCH RESULTS", "htps://www.duckduckgo.com", "Duckduckgo search results for %s")
'd':("DUCKDUCKGO SEARCH RESULTS", "htps://www.duckduckgo.com", "Duckduckgo search results for %s"),
'b': ("BING SEARCH RESULTS", "https://www.bing.com", "Bing search results for %s")
}

query = ''
Expand All @@ -20,26 +21,44 @@ def generateFeed(urls, stype):
fg.title(feed[0])
fg.link(href = feed[1], rel='alternate')
fg.description(feed[2]%query)

for url in urls:
fe = fg.add_entry()
fe.title(url[0])
fe.link({'href': url[1], 'rel': 'alternate'})
print fg.rss_str(pretty=True)
print fg.rss_str(pretty=True)
##Write to file
file_name = 'data/%s.xml'%query
fg.rss_file(file_name)

def bing_search(query):
''' Search bing for the query and return set of urls
Returns: urls (list)
[[Tile1,url1], [Title2, url2],..]
'''
urls = []
response = get_bing_page(query)
soup = BeautifulSoup(response.read(), 'html5lib')
# Search for all relevant 'div' tags with having the results
for li in soup.findAll('li', attrs = {'class' : ['b_algo']}):
# search for title
title = li.h2.text.replace("\n",'').replace(" ","")
# get anchor tag having the link
url = li.h2.a['href']
# get the short description
desc = li.find('p').text
url_entry = [title, url, desc]
urls.append(url_entry)
return urls

def duckduckgo_search(query):
''' Search google for the query and return set of urls
Returns: urls (list)
[[Tile1,url1], [Title2, url2],..]
'''
urls = []
SEARCH_ENDPOINT = "https://duckduckgo.com/html/"
resp = requests.get(SEARCH_ENDPOINT, params = {'q' : query})
soup = BeautifulSoup(resp.content, 'html5lib')
response = get_duckduckgo_page(query)
soup = BeautifulSoup(response.read(), 'html5lib')

# Search for all relevant 'div' tags with having the results
for div in soup.findAll('div', attrs = {'class' : ['result', 'results_links', 'results_links_deep', 'web-result']}):
Expand Down Expand Up @@ -80,6 +99,42 @@ def get_results_page(query):
br.form['q'] = query
return br.submit()

def get_duckduckgo_page(query):
"""
Fetch the duckduckgo search results page
:param query: String to be searched on duckduckgo
:return: Result page containing search results
"""
br = mechanize.Browser()
br.set_handle_robots(False) # Google's robot.txt prevents from scrapping
br.addheaders = [('User-agent', 'Mozilla/5.0')]
br.open('http://www.duckduckgo.com/html/')
br.select_form(name='x')
br.form['q'] = query
return br.submit()

def get_bing_page(query):
"""
Fetch the bing search results page
:param query: String to be searched on bing
:return: Result page containing search results
"""
br = mechanize.Browser()
br.set_handle_robots(False) # Google's robot.txt prevents from scrapping
br.addheaders = [('User-agent', 'Mozilla/5.0')]
br.open('http://www.bing.com/search')
formcount = 0
for form in br.forms():
if str(form.attrs["id"]) == "sb_form":
break
formcount += 1
br.select_form(nr=formcount)
br.form['q'] = query
return br.submit()


def read_in():
lines = sys.stdin.readlines()
return json.loads(lines[0])
Expand All @@ -93,10 +148,12 @@ def main():
print query
if stype == 'g':
urls = google_search(query)
else:
elif stype == 'd':
urls = duckduckgo_search(query)
else:
urls = bing_search(query)
generateFeed(urls, stype)


if __name__ == "__main__":
main()

0 comments on commit 55fe18b

Please sign in to comment.