# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Wikipedia (Web)
"""

from urllib.parse import quote
from json import loads

from lxml import html

from searx.utils import match_language, searx_useragent
from searx import network
from searx.enginelib.traits import EngineTraits

engine_traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

send_accept_language_header = True

# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
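
# Illustrative example (not part of the engine itself): for language 'en' and
# the title "Paris", search_url is formatted to
#   https://en.wikipedia.org/api/rest_v1/page/summary/Paris
# which returns a JSON summary of the article.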


# set language in base_url
def url_lang(lang):
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]
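
# Roughly: url_lang('de-DE') -> 'de' (assuming 'de' is in supported_languages),
# while 'all' or an unknown code falls back to 'en'.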


# do search-request
def request(query, params):
    if query.islower():
        query = query.title()

    language = url_lang(params['language'])
    params['url'] = search_url.format(title=quote(query), language=language)

    params['headers']['User-Agent'] = searx_useragent()
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params
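
# Illustrative example (values assumed): for the query "eiffel tower" with
# params['language'] == 'fr-FR', request() title-cases the query and sets
#   params['url'] == 'https://fr.wikipedia.org/api/rest_v1/page/summary/Eiffel%20Tower'
# along with the searx User-Agent header, raise_for_httperror=False and
# soft_max_redirects=2.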


# get response from search-request
def response(resp):
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = loads(resp.text)
        except:
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    network.raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title})

    results.append(
        {
            'infobox': title,
            'id': wikipedia_link,
            'content': api_result.get('extract', ''),
            'img_src': api_result.get('thumbnail', {}).get('source'),
            'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
        }
    )

    return results
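
# Fields consumed from the REST summary payload (a trimmed sketch, limited to
# what response() actually reads):
#   {
#     "type": "standard",
#     "title": "...",
#     "extract": "...",
#     "thumbnail": {"source": "https://..."},
#     "content_urls": {"desktop": {"page": "https://..."}}
#   }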


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = html.fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[1].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name}

    return supported_languages
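
# The returned mapping has the shape (values illustrative):
#   {'en': {'name': 'English', 'english_name': 'English'}, ...}
# where the keys are the wiki subdomain codes taken from the table at
# supported_languages_url.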


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # 'roa-tara': invented code used for the Tarantino Wikipedia (again, 'roa' is
    #   the standard code for the large family of Romance languages that the
    #   Tarantino dialect falls within)
    # 'simple': invented code used for the Simple English Wikipedia (not the
    #   official IETF code en-simple)
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
}
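
# Example: lang_map.get('als', 'als') yields 'gsw', while a code that is not in
# the map (e.g. 'de') falls through unchanged; fetch_traits() below uses this
# to normalize wiki subdomains before handing them to babel.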

unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    'sco',  # Scots (https://sco.wikipedia.org) is not known to babel; Scottish Gaelic (https://gd.wikipedia.org) is
    'sh',  # Serbo-Croatian
    'simple',  # Simple English is not known to babel as a natural language distinct from English
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia"""
    # pylint: disable=import-outside-toplevel
    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import language_tag

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia's quality, showing how
        # frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 users should be registered
                continue
        elif int(depth) < 20:
            continue

        eng_tag = cols[3]
        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['zh_Hans'] = 'zh'
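
# After a successful run, engine_traits.languages maps babel-style tags to wiki
# subdomains, roughly like (illustrative subset): 'nan' -> 'zh-min-nan',
# 'zh_Hant' -> 'zh-classical', and, via the explicit assignment above,
# 'zh_Hans' -> 'zh'.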