Ponysearch/searx/engines/wikipedia.py

"""
 Wikipedia (Web)

 @website     https://{language}.wikipedia.org
 @provide-api yes

 @using-api   yes
 @results     JSON
 @stable      yes
 @parse       url, infobox
"""

from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode

# search-url
# root of the language-specific wiki; {language} is filled in with str.format
base_url = u'https://{language}.wikipedia.org/'
# API call template; {query} receives an already url-encoded parameter string
search_url = base_url + (
    u'w/api.php?'
    'action=query'
    '&format=json'
    '&{query}'
    '&prop=extracts|pageimages'
    '&exintro'
    '&explaintext'
    '&pithumbsize=300'
    '&redirects'
)
# NOTE(review): presumably the page whose response is handed to
# _fetch_supported_languages -- it is not fetched anywhere in this file
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'


# set language in base_url
def url_lang(lang):
    """Return the language code to put into the wikipedia URLs.

    Only the part of ``lang`` before the first '-' is looked at. If that code
    is not contained in the module-level ``supported_languages`` the function
    falls back to 'en'.

    NOTE(review): ``supported_languages`` is not defined in this file; it is
    presumably injected into the module from outside -- confirm.
    """
    primary = lang.split('-')[0]
    return primary if primary in supported_languages else 'en'


# do search-request
def request(query, params):
    """Store the API lookup URL for ``query`` in ``params['url']``.

    ``query`` is handled as UTF-8 encoded bytes (it is decoded and re-encoded
    below). ``params['language']`` selects the wiki via ``url_lang``. The
    same ``params`` dict is returned.
    """
    if query.islower():
        # all-lowercase input: look up the query as typed and its
        # title-cased form, joined with '|'
        text = query.decode('utf-8')
        query = u'{0}|{1}'.format(text, text.title()).encode('utf-8')

    titles_param = urlencode({'titles': query})
    language = url_lang(params['language'])
    params['url'] = search_url.format(query=titles_param, language=language)

    return params


# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    """Return the first meaningful paragraph of ``content`` or None.

    ``content`` is split on newlines and the paragraphs are inspected in
    order. A paragraph is accepted when

    * it is at least 200 characters long, or
    * ``title`` (case-insensitive) occurs completely within its first
      ``len(title) + 35`` characters and either ``image`` is truthy or the
      paragraph is at least 150 characters long.

    The search gives up (returns None) once four paragraphs were rejected.
    The numeric thresholds were obtained by fine tuning.

    content -- plain text of the article intro; None or '' yields None
    title   -- article title looked for at the start of a paragraph
    image   -- thumbnail URL or None; only its truthiness is used
    """
    # a page without an extract would otherwise crash on .split()
    if not content:
        return None

    # loop invariants
    lowered_title = title.lower()
    search_end = len(title) + 35

    for attempt, paragraph in enumerate(content.split('\n'), 1):
        length = len(paragraph)
        starts_with_title = paragraph.lower().find(lowered_title, 0, search_end) >= 0

        if length >= 200 or (starts_with_title and (image or length >= 150)):
            return paragraph

        # attempt == number of paragraphs rejected so far
        if attempt > 3:
            return None

    return None


# get response from search-request
def response(resp):
    """Turn the API answer into a link result plus an infobox result.

    ``resp.text`` is parsed as JSON and ``resp.search_params['language']``
    selects the wiki the article link points to.

    Returns a list with two dicts (a plain url/title result and an infobox)
    or an empty list when the answer contains no usable article, i.e. no
    'query'/'pages' data, no page with a positive id, or no title.
    """
    search_result = loads(resp.text)

    # tolerate answers that carry no 'query' or 'pages' section at all
    pages = search_result.get('query', {}).get('pages', {})

    # wikipedia article's unique id
    # first valid (positive) id is assumed to be the requested article
    page = None
    for article_id in pages:
        if int(article_id) > 0:
            page = pages[article_id]
            break

    if page is None:
        return []

    title = page.get('title')
    if not title:
        # without a title neither the link nor the infobox can be built
        return []

    image = page.get('thumbnail')
    if image:
        image = image.get('source')

    # a page may come without an extract; treat that as empty text
    extract = page.get('extract') or ''

    summary = extract_first_paragraph(extract, title, image)

    # link to wikipedia article
    language = url_lang(resp.search_params['language'])
    article_path = quote(title.replace(' ', '_').encode('utf8'))
    wikipedia_link = base_url.format(language=language) + 'wiki/' + article_path

    return [
        {'url': wikipedia_link, 'title': title},
        {'infobox': title,
         'id': wikipedia_link,
         'content': summary,
         'img_src': image,
         'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]},
    ]


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Build the language table from the HTML in ``resp.text``.

    Every table whose class contains "sortable" is scanned row by row (the
    first row of each table is skipped as header). Per row the link text of
    cell 3 is the language code, cell 2 the name, cell 1 the English name and
    the bold link text of cell 4 the article count (with ',' separators).

    Returns a dict mapping code -> {"name", "english_name", "articles"} for
    all languages with at least 100 articles.
    """
    languages = {}
    document = fromstring(resp.text)

    for table in document.xpath('//table[contains(@class,"sortable")]'):
        # exclude header row
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            article_count = int(cells[4].xpath('./a/b')[0].text.replace(',', ''))

            # exclude languages with too few articles
            if article_count < 100:
                continue

            languages[code] = {"name": name, "english_name": english_name, "articles": article_count}

    return languages
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00			`"""`
			`Wikipedia (Web)`

			`@website https://{language}.wikipedia.org`
			`@provide-api yes`

			`@using-api yes`
			`@results JSON`
			`@stable yes`
			`@parse url, infobox`
			`"""`

			`from json import loads`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00			`from lxml.html import fromstring`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`from searx.url_utils import quote, urlencode`
[enh] add supported_languages on engines and auto-generate languages.py 2016-08-06 06:34:56 +02:00
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00			`# search-url`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`base_url = u'https://{language}.wikipedia.org/'`
			`search_url = base_url + u'w/api.php?'\`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00			`'action=query'\`
			`'&format=json'\`
			`'&{query}'\`
			`'&prop=extracts\|pageimages'\`
			`'&exintro'\`
			`'&explaintext'\`
			`'&pithumbsize=300'\`
			`'&redirects'`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00			`supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00

			`# set language in base_url`
			`def url_lang(lang):`
[enh] add supported_languages on engines and auto-generate languages.py 2016-08-06 06:34:56 +02:00			`lang = lang.split('-')[0]`
remove 'all' option from search languages 2017-07-20 22:47:20 +02:00			`if lang not in supported_languages:`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00			`language = 'en'`
			`else:`
[enh] add supported_languages on engines and auto-generate languages.py 2016-08-06 06:34:56 +02:00			`language = lang`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`return language`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00

			`# do search-request`
			`def request(query, params):`
			`if query.islower():`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`query = u'{0}\|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`params['url'] = search_url.format(query=urlencode({'titles': query}),`
			`language=url_lang(params['language']))`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00
			`return params`


			`# get first meaningful paragraph`
			`# this should filter out disambiguation pages and notes above first paragraph`
			`# "magic numbers" were obtained by fine tuning`
			`def extract_first_paragraph(content, title, image):`
			`first_paragraph = None`

			`failed_attempts = 0`
			`for paragraph in content.split('\n'):`

			`starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)`
			`length = len(paragraph)`

			`if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):`
			`first_paragraph = paragraph`
			`break`

			`failed_attempts += 1`
			`if failed_attempts > 3:`
			`return None`

			`return first_paragraph`


			`# get response from search-request`
			`def response(resp):`
			`results = []`

[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`search_result = loads(resp.text)`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00
			`# wikipedia article's unique id`
			`# first valid id is assumed to be the requested article`
			`for article_id in search_result['query']['pages']:`
			`page = search_result['query']['pages'][article_id]`
			`if int(article_id) > 0:`
			`break`

			`if int(article_id) < 0:`
			`return []`

			`title = page.get('title')`

			`image = page.get('thumbnail')`
			`if image:`
			`image = image.get('source')`

			`extract = page.get('extract')`

			`summary = extract_first_paragraph(extract, title, image)`

			`# link to wikipedia article`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \`
[fix] urls merge in infobox (#593) TODO: merge attributes 2016-06-24 07:38:17 +02:00			`+ 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))`
[enh] wikipedia infobox creates simple multilingual infobox using wikipedia's api 2016-03-14 07:32:36 +01:00
			`results.append({'url': wikipedia_link, 'title': title})`

			`results.append({'infobox': title,`
			`'id': wikipedia_link,`
			`'content': summary,`
			`'img_src': image,`
			`'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})`

			`return results`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00

			`# get supported languages from their site`
tests for _fetch_supported_languages in engines and refactor method to make it testable without making requests 2016-12-15 07:34:43 +01:00			`def _fetch_supported_languages(resp):`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00			`supported_languages = {}`
tests for _fetch_supported_languages in engines and refactor method to make it testable without making requests 2016-12-15 07:34:43 +01:00			`dom = fromstring(resp.text)`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00			`tables = dom.xpath('//table[contains(@class,"sortable")]')`
			`for table in tables:`
			`# exclude header row`
			`trs = table.xpath('.//tr')[1:]`
			`for tr in trs:`
			`td = tr.xpath('./td')`
			`code = td[3].xpath('./a')[0].text`
			`name = td[2].xpath('./a')[0].text`
			`english_name = td[1].xpath('./a')[0].text`
			`articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))`
minor fixes in utils/fetch_languages.py 2016-12-17 05:14:14 +01:00			`# exclude languages with too few articles`
change language list to only include languages with a minimum of engines that support them. users can still query lesser supported through the :lang_code bang. 2016-12-29 06:24:56 +01:00			`if articles >= 100:`
[mod] fetch supported languages for several engines utils/fetch_languages.py gets languages supported by each engine and generates engines_languages.json with each engine's supported language. 2016-11-06 03:51:38 +01:00			`supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}`

			`return supported_languages`