Merge pull request fossasia#13 from rhemon/bing

Fix for fossasia#11 and fossasia#12
ishucr7 · Jan 3, 2017 · 55fe18b · 55fe18b
2 parents 914d36f + c113935
commit 55fe18b
Showing 1 changed file with 65 additions and 8 deletions.
diff --git a/rss-generator.py b/rss-generator.py
@@ -7,7 +7,8 @@
 
 
 search_engines = {'g':("GOOGLE SEARCH RESULTS", "htps://www.google.com", "Google search results for %s"),
- 'd':("DUCKDUCKGO SEARCH RESULTS", "htps://www.duckduckgo.com", "Duckduckgo search results for %s")
+ 'd':("DUCKDUCKGO SEARCH RESULTS", "htps://www.duckduckgo.com", "Duckduckgo search results for %s"),
+ 'b': ("BING SEARCH RESULTS", "https://www.bing.com", "Bing search results for %s")
  }
 
 query = ''
@@ -20,26 +21,44 @@ def generateFeed(urls, stype):
  fg.title(feed[0])
  fg.link(href = feed[1], rel='alternate')
  fg.description(feed[2]%query)
- 
+
  for url in urls:
  fe = fg.add_entry()
  fe.title(url[0])
  fe.link({'href': url[1], 'rel': 'alternate'})
- print fg.rss_str(pretty=True) 
+ print fg.rss_str(pretty=True)
  ##Write to file
  file_name = 'data/%s.xml'%query
  fg.rss_file(file_name)
 
+def bing_search(query):
+ ''' Search bing for the query and return set of urls
+ Returns: urls (list)
+ [[Tile1,url1], [Title2, url2],..]
+ '''
+ urls = []
+ response = get_bing_page(query)
+ soup = BeautifulSoup(response.read(), 'html5lib')
+ # Search for all relevant 'div' tags with having the results
+ for li in soup.findAll('li', attrs = {'class' : ['b_algo']}):
+ # search for title
+ title = li.h2.text.replace("\n",'').replace(" ","")
+ # get anchor tag having the link
+ url = li.h2.a['href']
+ # get the short description
+ desc = li.find('p').text
+ url_entry = [title, url, desc]
+ urls.append(url_entry)
+ return urls
 
 def duckduckgo_search(query):
  ''' Search google for the query and return set of urls
  Returns: urls (list)
  [[Tile1,url1], [Title2, url2],..]
  '''
  urls = []
- SEARCH_ENDPOINT = "https://duckduckgo.com/html/"
- resp = requests.get(SEARCH_ENDPOINT, params = {'q' : query})
- soup = BeautifulSoup(resp.content, 'html5lib')
+ response = get_duckduckgo_page(query)
+ soup = BeautifulSoup(response.read(), 'html5lib')
 
  # Search for all relevant 'div' tags with having the results
  for div in soup.findAll('div', attrs = {'class' : ['result', 'results_links', 'results_links_deep', 'web-result']}):
@@ -80,6 +99,42 @@ def get_results_page(query):
  br.form['q'] = query
  return br.submit()
 
+def get_duckduckgo_page(query):
+ """
+ Fetch the duckduckgo search results page
+
+ :param query: String to be searched on duckduckgo
+ :return: Result page containing search results
+ """
+ br = mechanize.Browser()
+ br.set_handle_robots(False) # Google's robot.txt prevents from scrapping
+ br.addheaders = [('User-agent', 'Mozilla/5.0')]
+ br.open('http://www.duckduckgo.com/html/')
+ br.select_form(name='x')
+ br.form['q'] = query
+ return br.submit()
+
+def get_bing_page(query):
+ """
+ Fetch the bing search results page
+
+ :param query: String to be searched on bing
+ :return: Result page containing search results
+ """
+ br = mechanize.Browser()
+ br.set_handle_robots(False) # Google's robot.txt prevents from scrapping
+ br.addheaders = [('User-agent', 'Mozilla/5.0')]
+ br.open('http://www.bing.com/search')
+ formcount = 0
+ for form in br.forms():
+ if str(form.attrs["id"]) == "sb_form":
+ break
+ formcount += 1
+ br.select_form(nr=formcount)
+ br.form['q'] = query
+ return br.submit()
+
+
 def read_in():
  lines = sys.stdin.readlines()
  return json.loads(lines[0])
@@ -93,10 +148,12 @@ def main():
  print query
  if stype == 'g':
  urls = google_search(query)
- else:
+ elif stype == 'd':
  urls = duckduckgo_search(query)
+ else:
+ urls = bing_search(query)
  generateFeed(urls, stype)
 
- 
+
 if __name__ == "__main__":
  main()