forked from Ponysearch/Ponysearch
16f0db4493
This patch replaces the *full of magic* ``utils.match_language`` function with ``locales.match_locale``. The ``locales.match_locale`` function is based on ``locales.build_engine_locales``, which was introduced in 9ae409a0 [1].
In the past SearXNG only supported searching by language, not by region. This changed a long time ago: regions have been added to the SearXNG core, but not to the engines. ``utils.match_language`` was the function that handled the different aspects of languages/regions in the SearXNG core and the supported *languages* in the engines. ``utils.match_language`` did this with some magic; it worked well for most use cases but failed in some edge cases. To resolve the competing handling of languages and regions in the SearXNG core, ``locales.build_engine_locales`` was introduced in 9ae409a0 [1].
With the latest patches all engines have been migrated to ``fetch_traits`` and to a language/region concept that is based on ``locales.build_engine_locales``. To summarize: there is no longer a need for ``utils.match_language``. [1] https://github.com/searxng/searxng/pull/1652 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
305 lines
11 KiB
Python
Executable file
305 lines
11 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# lint: pylint
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
"""Fetch website description from websites and from
|
|
:origin:`searx/engines/wikidata.py` engine.
|
|
|
|
Output file: :origin:`searx/data/engine_descriptions.json`.
|
|
|
|
"""
|
|
|
|
# pylint: disable=invalid-name, global-statement
|
|
|
|
import json
|
|
from urllib.parse import urlparse
|
|
from os.path import join
|
|
|
|
from lxml.html import fromstring
|
|
|
|
from searx.engines import wikidata, set_loggers
|
|
from searx.utils import extract_text
|
|
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
|
|
from searx import searx_dir
|
|
from searx.utils import gen_useragent, detect_language
|
|
import searx.search
|
|
import searx.network
|
|
|
|
# Module-level side effects that run on import.
# NOTE(review): presumably set_loggers() attaches a logger named 'wikidata' to
# the imported wikidata engine module -- confirm against searx.engines.
set_loggers(wikidata, 'wikidata')
# NOTE(review): presumably prepares the locale data of searx.locales that is
# read further down through LOCALE_NAMES -- confirm against searx.locales.
locales_initialize()
|
|
|
|
SPARQL_WIKIPEDIA_ARTICLE = """
|
|
SELECT DISTINCT ?item ?name
|
|
WHERE {
|
|
hint:Query hint:optimizer "None".
|
|
VALUES ?item { %IDS% }
|
|
?article schema:about ?item ;
|
|
schema:inLanguage ?lang ;
|
|
schema:name ?name ;
|
|
schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
|
|
FILTER(?lang in (%LANGUAGES_SPARQL%)) .
|
|
FILTER (!CONTAINS(?name, ':')) .
|
|
}
|
|
"""
|
|
|
|
SPARQL_DESCRIPTION = """
|
|
SELECT DISTINCT ?item ?itemDescription
|
|
WHERE {
|
|
VALUES ?item { %IDS% }
|
|
?item schema:description ?itemDescription .
|
|
FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
|
|
}
|
|
ORDER BY ?itemLang
|
|
"""
|
|
|
|
# Texts that are too generic to be a description.  update_description()
# compares the normalized, lower-cased description against this list and
# discards the description on a match, so every entry has to be lower case.
NOT_A_DESCRIPTION = [
    'web site',
    'site web',
    'komputa serĉilo',
    'interreta serĉilo',
    'bilaketa motor',
    'web search engine',
    'wikimedia täpsustuslehekülg',
]
|
|
|
|
# (engine_name, source) pairs for which update_description() discards every
# description coming from that source.
SKIP_ENGINE_SOURCE = [
    # fmt: off
    ('gitlab', 'wikidata')
    # descriptions are about wikipedia disambiguation pages
    # fmt: on
]
|
|
|
|
# Locale tags handled by this script: the keys of searx.locales.LOCALE_NAMES.
LANGUAGES = LOCALE_NAMES.keys()
# Maps each entry of LANGUAGES to the value returned by the wikipedia engine's
# url_lang(); the placeholder below is replaced by initialize().
WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'}
# Comma separated, single-quoted values of WIKIPEDIA_LANGUAGES, inserted into
# the SPARQL templates in place of %LANGUAGES_SPARQL%; built by initialize().
LANGUAGES_SPARQL = ''
# Space separated "wd:<wikidata id>" terms, inserted into the SPARQL templates
# in place of %IDS%; built by initialize().
IDS = None

# descriptions[engine_name][lang] = [description, source]
descriptions = {}
# wd_to_engine_name[wikidata_id] = set of the engine names using this id
wd_to_engine_name = {}
|
|
|
|
|
|
def normalize_description(description):
    """Return *description* with control characters and whitespace collapsed.

    Every character with a code point from 0 to 30 is turned into a space.
    Leading and trailing whitespace is then removed and each remaining run of
    whitespace is squeezed into one single space.
    """
    control_to_space = dict.fromkeys(range(0, 31), ' ')
    without_controls = description.translate(control_to_space)
    words = without_controls.strip().split()
    return ' '.join(words)
|
|
|
|
|
|
def update_description(engine_name, lang, description, source, replace=True):
    """Record ``[description, source]`` in ``descriptions[engine_name][lang]``.

    Nothing is recorded when *description* is not a string or when, once
    normalized, it

    - equals the engine name (case-insensitive),
    - is listed in ``NOT_A_DESCRIPTION`` (compared in lower case),
    - comes from a ``(engine_name, source)`` pair of ``SKIP_ENGINE_SOURCE``,
    - does not contain any space.

    With ``replace=False`` an entry that already exists for *lang* is kept.
    """
    if not isinstance(description, str):
        return

    text = normalize_description(description)
    lowered = text.lower()
    rejected = (
        lowered == engine_name.lower()
        or lowered in NOT_A_DESCRIPTION
        or (engine_name, source) in SKIP_ENGINE_SOURCE
        # a single word (like "website") is not a usable description
        or ' ' not in text
    )
    if rejected:
        return

    engine_descriptions = descriptions[engine_name]
    if replace or lang not in engine_descriptions:
        engine_descriptions[lang] = [text, source]
|
|
|
|
|
|
def get_wikipedia_summary(lang, pageid):
    """Return the ``extract`` value of the Wikipedia response for *pageid*.

    The request is prepared by the ``request()`` function of the ``wikipedia``
    engine, called with *pageid* and a params dict whose ``language`` is
    *lang* with ``_`` replaced by ``-``.

    Returns ``None`` when the HTTP request, the status check or the JSON
    decoding fails, and also when the decoded JSON has no ``extract`` key.
    """
    request_params = {'language': lang.replace('_', '-'), 'headers': {}}
    wikipedia_engine = searx.engines.engines['wikipedia']
    # request() is expected to store the target under request_params['url']
    wikipedia_engine.request(pageid, request_params)
    try:
        http_response = searx.network.get(request_params['url'], headers=request_params['headers'], timeout=10)
        http_response.raise_for_status()
        return json.loads(http_response.text).get('extract')
    except Exception:  # pylint: disable=broad-except
        # best effort: any failure simply means "no summary"
        return None
|
|
|
|
|
|
def get_website_description(url, lang1, lang2=None):
    """Fetch *url* and return the language and the description of the page.

    The description is the first non-empty value of

    1. ``<meta name="description">``,
    2. ``<meta property="og:description">``,
    3. ``<title>``,

    each looked up in ``/html/head``.  The language is the result of
    ``detect_language()`` on the description; if that yields nothing the
    ``lang`` attribute of ``<html>`` is used and finally ``'en'``.  Only the
    part of the language tag before the first ``_`` / ``-`` is returned.

    :param url: URL of the page to fetch.
    :param lang1: language sent in the ``Accept-Language`` header, or ``None``
        to send no ``Accept-Language`` header at all.
    :param lang2: optional second language for the ``Accept-Language`` header
        (only used when *lang1* is given).
    :return: tuple ``(lang, description)`` or ``(None, None)`` when the page
        can't be fetched or answers with an HTTP error status.
    """
    headers = {
        'User-Agent': gen_useragent(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0',
    }
    if lang1 is not None:
        lang_list = [lang1]
        if lang2 is not None:
            lang_list.append(lang2)
        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'

    try:
        response = searx.network.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception:  # pylint: disable=broad-except
        # best effort: an unreachable site is reported as "no description"
        return (None, None)

    try:
        html = fromstring(response.text)
    except ValueError:
        # lxml refuses a str that carries an encoding declaration: parse the
        # raw bytes instead
        html = fromstring(response.content)

    # take the first non-empty candidate, in order of preference
    description = None
    for xpath in (
        '/html/head/meta[@name="description"]/@content',
        '/html/head/meta[@property="og:description"]/@content',
        '/html/head/title',
    ):
        description = extract_text(html.xpath(xpath))
        if description:
            break

    lang = extract_text(html.xpath('/html/@lang'))
    # lang1 is None when the caller did not ask for a language: test its
    # truthiness, len(lang1) would raise a TypeError in that case.
    # NOTE(review): this fallback only triggers when extract_text() returns
    # None; if it returns '' for a missing attribute the requested language is
    # never used here -- confirm against searx.utils.extract_text.
    if lang is None and lang1:
        lang = lang1
    lang = detect_language(description) or lang or 'en'
    # keep the language only: 'pt_BR' / 'pt-BR' --> 'pt'
    lang = lang.split('_')[0].split('-')[0]
    return (lang, description)
|
|
|
|
|
|
def initialize():
    """Initialize the search and build the module level lookup tables.

    After ``searx.search.initialize()`` this function (re)builds

    - ``WIKIPEDIA_LANGUAGES``: entry of ``LANGUAGES`` --> value returned by
      the ``url_lang()`` function of the ``wikipedia`` engine (``nb_NO`` is
      forced to ``no``),
    - ``LANGUAGES_SPARQL``: the distinct values of ``WIKIPEDIA_LANGUAGES``,
      single-quoted and comma separated,
    - ``descriptions``: one empty dict per engine,
    - ``wd_to_engine_name``: ``wikidata_id`` --> set of engine names,
    - ``IDS``: the keys of ``wd_to_engine_name`` prefixed by ``wd:`` and
      separated by spaces.
    """
    global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL
    searx.search.initialize()

    wikipedia_engine = searx.engines.engines['wikipedia']
    wikipedia_languages = {}
    for locale_tag in LANGUAGES:
        wikipedia_languages[locale_tag] = wikipedia_engine.url_lang(locale_tag.replace('_', '-'))
    wikipedia_languages['nb_NO'] = 'no'
    WIKIPEDIA_LANGUAGES = wikipedia_languages

    quoted_codes = [f"'{code}'" for code in set(WIKIPEDIA_LANGUAGES.values())]
    LANGUAGES_SPARQL = ', '.join(quoted_codes)

    for engine_name, engine in searx.engines.engines.items():
        descriptions[engine_name] = {}
        wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
        if wikidata_id is None:
            continue
        # several engines may share the same wikidata item
        wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)

    IDS = ' '.join('wd:' + wd_id for wd_id in wd_to_engine_name)
|
|
|
|
|
|
def fetch_wikidata_descriptions():
    """Collect the item descriptions of Wikidata for all known engines.

    Sends the ``SPARQL_DESCRIPTION`` query (with ``%IDS%`` and
    ``%LANGUAGES_SPARQL%`` filled in) and passes each returned description to
    ``update_description()`` with the source ``'wikidata'``: once per engine
    registered for the item and per entry of ``LANGUAGES`` whose
    ``WIKIPEDIA_LANGUAGES`` value equals the language of the description.
    Does nothing when the query returns ``None``.
    """
    searx.network.set_timeout_for_thread(60)
    query = SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
    result = wikidata.send_wikidata_query(query)
    if result is None:
        return

    entity_prefix = 'http://www.wikidata.org/entity/'
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace(entity_prefix, '')
        wikidata_lang = binding['itemDescription']['xml:lang']
        description = binding['itemDescription']['value']
        for engine_name in wd_to_engine_name[wikidata_id]:
            for lang in LANGUAGES:
                if WIKIPEDIA_LANGUAGES[lang] != wikidata_lang:
                    continue
                update_description(engine_name, lang, description, 'wikidata')
|
|
|
|
|
|
def fetch_wikipedia_descriptions():
    """Collect the Wikipedia summaries of the articles about known engines.

    Sends the ``SPARQL_WIKIPEDIA_ARTICLE`` query (with ``%IDS%`` and
    ``%LANGUAGES_SPARQL%`` filled in).  For each returned article name, each
    engine registered for the item and each entry of ``LANGUAGES`` whose
    ``WIKIPEDIA_LANGUAGES`` value equals the language of the name, the summary
    is fetched by ``get_wikipedia_summary()`` and passed to
    ``update_description()`` with the source ``'wikipedia'``.
    Does nothing when the query returns ``None``.
    """
    query = SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
    result = wikidata.send_wikidata_query(query)
    if result is None:
        return

    entity_prefix = 'http://www.wikidata.org/entity/'
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace(entity_prefix, '')
        wikidata_lang = binding['name']['xml:lang']
        # the article name is what get_wikipedia_summary() asks for
        pageid = binding['name']['value']
        for engine_name in wd_to_engine_name[wikidata_id]:
            for lang in LANGUAGES:
                if WIKIPEDIA_LANGUAGES[lang] != wikidata_lang:
                    continue
                summary = get_wikipedia_summary(lang, pageid)
                update_description(engine_name, lang, summary, 'wikipedia')
|
|
|
|
|
|
def normalize_url(url):
    """Turn an engine URL (template) into the URL of the site's root page.

    The ``{language}`` placeholder is replaced by ``en``, the path is set to
    ``/`` and params, query and fragment are dropped.  In the result the text
    ``https://api.`` is replaced by ``https://``.
    """
    english_url = url.replace('{language}', 'en')
    parts = urlparse(english_url)
    root_url = parts._replace(path='/', params='', query='', fragment='').geturl()
    return root_url.replace('https://api.', 'https://')
|
|
|
|
|
|
def fetch_website_description(engine_name, website):
    """Fetch the descriptions *website* publishes about itself.

    The page is first requested without any ``Accept-Language``; if that fails
    the engine is skipped.  Then the page is requested once per language that
    has no description yet for *engine_name*, starting with ``en``, ``es``,
    ``pt``, ``ru``, ``tr``, ``fr`` followed by the remaining ``LANGUAGES``.
    The language reported for the page is matched against ``LANGUAGES`` (and,
    failing that, against the values of ``WIKIPEDIA_LANGUAGES``); on a match
    the description is stored without replacing an existing one.

    The loop ends early once the matched language has repeated six times in a
    row.
    """
    default_lang, default_description = get_website_description(website, None, None)
    if default_lang is None or default_description is None:
        # the front page can't be fetched: skip this engine
        return

    locale_by_wikipedia_lang = {wp_lang: locale for locale, wp_lang in WIKIPEDIA_LANGUAGES.items()}
    first_languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
    languages = first_languages + [locale for locale in LANGUAGES if locale not in first_languages]

    last_matched_lang = None
    repeat_count = 0
    for lang in languages:
        if lang in descriptions[engine_name]:
            # a description is already known for this language
            continue

        fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
        if fetched_lang is None or desc is None:
            continue

        matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
        if matched_lang is None:
            # second chance: match a wikipedia language and map it back
            fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
            matched_lang = locale_by_wikipedia_lang.get(fetched_wikipedia_lang)
        if matched_lang is not None:
            update_description(engine_name, matched_lang, desc, website, replace=False)

        # check if the matched language changes with the requested language
        if matched_lang != last_matched_lang:
            last_matched_lang = matched_lang
            repeat_count = 0
            continue
        repeat_count += 1
        if repeat_count == 6:
            # the website keeps answering in the same language whatever the
            # Accept-Language header is: stop now
            break
|
|
|
|
|
|
def fetch_website_descriptions():
    """Run ``fetch_website_description()`` for every engine with a website.

    The website is the ``website`` entry of the engine's ``about`` dict.  When
    there is none, it is derived by ``normalize_url()`` from the engine's
    ``search_url`` or else from its ``base_url``.  Engines without any of
    these are skipped.
    """
    for engine_name, engine in searx.engines.engines.items():
        website = getattr(engine, "about", {}).get('website')
        for url_attr in ("search_url", "base_url"):
            if website is None and hasattr(engine, url_attr):
                website = normalize_url(getattr(engine, url_attr))
        if website is None:
            continue
        fetch_website_description(engine_name, website)
|
|
|
|
|
|
def get_engine_descriptions_filename():
    """Return the path of ``data/engine_descriptions.json`` in ``searx_dir``."""
    return join(searx_dir, "data", "engine_descriptions.json")
|
|
|
|
|
|
def get_output():
    """Build the structure that is written to the JSON file.

    Input is ``descriptions[engine][language] = [description, source]``, the
    result is ``output[language][engine] = value`` where *value* is one of:

    * ``[description, source]``
    * ``description`` (plain string, when the source is ``"wikipedia"``)
    * ``["engine:lang", "ref"]``: reference to the first entry seen with
      exactly the same description text

    ``output`` has a (possibly empty) dict for every key of ``LOCALE_NAMES``.
    """
    output = {locale: {} for locale in LOCALE_NAMES}
    seen_descriptions = {}

    for engine_name, lang_descriptions in descriptions.items():
        for language, entry in lang_descriptions.items():
            text = entry[0]
            if text not in seen_descriptions:
                seen_descriptions[text] = (engine_name, language)
                # the wikipedia source is stored implicitly: text only
                value = text if entry[1] == 'wikipedia' else entry
            else:
                # same text as an earlier entry: store a reference to it
                ref_engine, ref_language = seen_descriptions[text]
                value = [f'{ref_engine}:{ref_language}', 'ref']
            output.setdefault(language, {}).setdefault(engine_name, value)

    return output
|
|
|
|
|
|
def main():
    """Fetch all descriptions and write them to the JSON output file.

    The sources are queried in this order: wikidata, wikipedia and finally the
    websites of the engines.  The result of ``get_output()`` is written as
    UTF-8 encoded JSON to ``get_engine_descriptions_filename()``.
    """
    initialize()
    steps = (
        ('Fetching wikidata descriptions', fetch_wikidata_descriptions),
        ('Fetching wikipedia descriptions', fetch_wikipedia_descriptions),
        ('Fetching website descriptions', fetch_website_descriptions),
    )
    for message, fetch in steps:
        print(message)
        fetch()

    output = get_output()
    target = get_engine_descriptions_filename()
    with open(target, 'w', encoding='utf8') as outfile:
        serialized = json.dumps(output, indent=1, separators=(',', ':'), ensure_ascii=False)
        outfile.write(serialized)
|
|
|
|
|
|
# Run the update only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|