forked from searx/searx
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Alice Ferrazzi
authored and
Alice Ferrazzi
committed
Aug 12, 2018
1 parent
b8978b4
commit b7886bb
Showing
2 changed files
with
126 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Gentoo Wiki | ||
@website https://wiki.gentoo.org | ||
@provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it) | ||
@using-api no | ||
@results HTML | ||
@stable no (HTML can change) | ||
@parse url, title | ||
""" | ||
|
||
from lxml import html | ||
from searx.engines.xpath import extract_text | ||
from searx.url_utils import urlencode, urljoin | ||
|
||
# engine dependent config
categories = ['it']  # result category this engine is listed under
language_support = True  # request() narrows the search by params['language']
paging = True  # request() turns params['pageno'] into a result offset
base_url = 'https://wiki.gentoo.org'  # main wiki site
|
||
# xpath queries
# one <li> element per search hit
xpath_results = '//ul[@class="mw-search-results"]/li'
# heading anchor of a hit, evaluated relative to that hit's <li>
xpath_link = './/div[@class="mw-search-result-heading"]/a'
|
||
|
||
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on | ||
def locale_to_lang_code(locale):
    """Return the part of ``locale`` before its first '-'.

    A locale without a '-' is returned unchanged, e.g. 'en-US' -> 'en'
    and 'fr' -> 'fr'.
    """
    # splitting a string that has no '-' yields the string itself, so no
    # separate check for the separator is needed
    return locale.split('-', 1)[0]
|
||
|
||
# wikis for some languages were moved off from the main site, we need to make | ||
# requests to correct URLs to be able to get results in those languages | ||
# Per-language URL parts: 'base' is the site root, 'search' is a path
# template with {offset} and {query} placeholders filled in by request().
lang_urls = {
    'en': {
        'base': 'https://wiki.gentoo.org',
        'search': '/index.php?title=Special:Search&offset={offset}&{query}',
    },
}
|
||
|
||
# get base & search URLs for selected language | ||
def get_lang_urls(language):
    """Return the ``lang_urls`` entry for ``language``.

    Falls back to the 'en' entry when ``language`` has no entry of its own.
    """
    return lang_urls.get(language, lang_urls['en'])
|
||
|
||
# Language names to build search requests for | ||
# those languages which are hosted on the main site. | ||
# Maps a language code to the language's native name; request() appends
# that name to the query text to narrow results to the language.
# NOTE(review): 'sl' is paired with 'Slovenský', which looks like the
# Slovak name (code 'sk') rather than Slovenian - confirm intended code.
main_langs = {
    'ar': 'العربية',
    'bg': 'Български',
    'cs': 'Česky',
    'da': 'Dansk',
    'el': 'Ελληνικά',
    'es': 'Español',
    'he': 'עברית',
    'hr': 'Hrvatski',
    'hu': 'Magyar',
    'it': 'Italiano',
    'ko': '한국어',
    'lt': 'Lietuviškai',
    'nl': 'Nederlands',
    'pl': 'Polski',
    'pt': 'Português',
    'ru': 'Русский',
    'sl': 'Slovenský',
    'th': 'ไทย',
    'uk': 'Українська',
    'zh': '简体中文',
}
# Union of the languages that have their own URLs and those searched on
# the main site; on a duplicate key the main_langs value wins.
supported_languages = dict(lang_urls)
supported_languages.update(main_langs)
|
||
|
||
# do search-request | ||
def request(query, params):
    """Build the search URL for ``query`` and store it in ``params['url']``.

    Reads ``params['language']`` (a locale such as 'en-US') and
    ``params['pageno']`` (1-based page number), and returns the same
    ``params`` dict with ``'url'`` set.

    ``query`` may be bytes or text; the language name appended to it is
    converted to the same type first, because concatenating bytes with text
    raises TypeError on Python 3.
    """
    # translate the locale (e.g. 'en-US') to language code ('en')
    language = locale_to_lang_code(params['language'])

    # if our language is hosted on the main site, we need to add its name
    # to the query in order to narrow the results to that language
    if language in main_langs:
        lang_name = main_langs[language]
        if isinstance(query, bytes):
            if not isinstance(lang_name, bytes):
                lang_name = lang_name.encode('utf-8')
            query += b' (' + lang_name + b')'
        else:
            if isinstance(lang_name, bytes):
                lang_name = lang_name.decode('utf-8')
            query += u' (' + lang_name + u')'

    # prepare the request parameters
    query = urlencode({'search': query})
    # the offset advances by 20 per page
    offset = (params['pageno'] - 1) * 20

    # get request URLs for our language of choice
    urls = get_lang_urls(language)
    search_url = urls['base'] + urls['search']

    params['url'] = search_url.format(query=query, offset=offset)

    return params
|
||
|
||
# get response from search-request | ||
def response(resp):
    """Parse the search result page in ``resp.text`` into a list of results.

    Reads ``resp.search_params['language']`` to pick the base URL used to
    resolve result links. Returns a list of dicts with the keys ``'url'``
    and ``'title'``, one per result item that contains a heading link.
    """
    # get the base URL for the language in which request was made
    language = locale_to_lang_code(resp.search_params['language'])
    lang_base_url = get_lang_urls(language)['base']

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(xpath_results):
        links = result.xpath(xpath_link)
        if not links:
            # an item without a heading link cannot produce a URL; skip it
            # instead of letting an IndexError discard every other result
            continue
        link = links[0]

        results.append({'url': urljoin(lang_base_url, link.attrib.get('href')),
                        'title': extract_text(link)})

    return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters