Merge pull request #2347 from return42/mod-lang-detection

If language recognition fails, use the Accept-Language header
This commit is contained in:
Markus Heiser 2023-04-25 15:46:26 +02:00 committed by GitHub
commit 45529f51a1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 71 deletions

View file

@ -8,9 +8,10 @@
from base64 import urlsafe_b64encode, urlsafe_b64decode from base64 import urlsafe_b64encode, urlsafe_b64decode
from zlib import compress, decompress from zlib import compress, decompress
from urllib.parse import parse_qs, urlencode from urllib.parse import parse_qs, urlencode
from typing import Iterable, Dict, List from typing import Iterable, Dict, List, Optional
import flask import flask
import babel
from searx import settings, autocomplete from searx import settings, autocomplete
from searx.enginelib import Engine from searx.enginelib import Engine
@ -287,10 +288,65 @@ class PluginsSetting(BooleanChoices):
return [item[len('plugin_') :] for item in items] return [item[len('plugin_') :] for item in items]
class ClientPref:
    """Container to assemble client preferences and settings."""

    # hint: searx.webapp.get_client_settings should be moved into this class

    locale: Optional[babel.Locale]
    """Locale preferred by the client (``None`` if no preference is known)."""

    def __init__(self, locale: Optional[babel.Locale] = None):
        self.locale = locale

    @property
    def locale_tag(self) -> Optional[str]:
        """Language tag of :py:obj:`ClientPref.locale`.

        The tag is ``<language>`` or ``<language>-<territory>`` when the locale
        has a territory.  ``None`` is returned when no locale is set.
        """
        if self.locale is None:
            return None
        tag = self.locale.language
        if self.locale.territory:
            tag += '-' + self.locale.territory
        return tag

    @classmethod
    def from_http_request(cls, http_request: flask.Request):
        """Build ClientPref object from HTTP request.

        - `Accept-Language used for locale setting
          <https://www.w3.org/International/questions/qa-accept-lang-locales.en>`__

        The locale with the highest q-value is selected; on equal q-values the
        entry listed first wins.  Entries whose language tag or q-value can not
        be parsed are ignored.  If the header is missing or contains no usable
        entry (e.g. ``Accept-Language: *``), the returned object has
        ``locale=None``.
        """
        al_header = http_request.headers.get("Accept-Language")
        if not al_header:
            return cls(locale=None)

        pairs = []
        for item in al_header.split(','):
            # Pad with a default weight so that entries without ";q=" get q=1.
            lang, qvalue = [part.strip() for part in (item.split(';') + ['q=1'])[:2]]
            try:
                qvalue = float(qvalue.split('=')[-1])
                locale = babel.Locale.parse(lang, sep='-')
            except (ValueError, babel.core.UnknownLocaleError):
                continue
            pairs.append((locale, qvalue))

        if not pairs:
            # A header was sent, but none of its entries is a usable locale
            # (wildcard, unknown language, malformed q-value, ...).
            return cls(locale=None)

        # max() returns the first maximal element, i.e. the same entry a stable
        # descending sort by q-value would place first.
        best_locale, _ = max(pairs, key=lambda pair: pair[1])
        return cls(locale=best_locale)
class Preferences: class Preferences:
"""Validates and saves preferences to cookies""" """Validates and saves preferences to cookies"""
def __init__(self, themes: List[str], categories: List[str], engines: Dict[str, Engine], plugins: Iterable[Plugin]): def __init__(
self,
themes: List[str],
categories: List[str],
engines: Dict[str, Engine],
plugins: Iterable[Plugin],
client: Optional[ClientPref] = None,
):
super().__init__() super().__init__()
self.key_value_settings: Dict[str, Setting] = { self.key_value_settings: Dict[str, Setting] = {
@ -414,6 +470,7 @@ class Preferences:
self.engines = EnginesSetting('engines', engines=engines.values()) self.engines = EnginesSetting('engines', engines=engines.values())
self.plugins = PluginsSetting('plugins', plugins=plugins) self.plugins = PluginsSetting('plugins', plugins=plugins)
self.tokens = SetSetting('tokens') self.tokens = SetSetting('tokens')
self.client = client or ClientPref()
self.unknown_params: Dict[str, str] = {} self.unknown_params: Dict[str, str] = {}
def get_as_url_params(self): def get_as_url_params(self):

View file

@ -22,7 +22,6 @@ from searx.network import initialize as initialize_network, check_network_config
from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
from searx.search.processors import PROCESSORS, initialize as initialize_processors from searx.search.processors import PROCESSORS, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker from searx.search.checker import initialize as initialize_checker
from searx.utils import detect_language
logger = logger.getChild('search') logger = logger.getChild('search')
@ -40,57 +39,19 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False,
initialize_checker() initialize_checker()
def replace_auto_language(search_query: SearchQuery):
    """Resolve the language alias "auto" of a search query in place.

    Nothing happens unless `search_query.lang` is "auto".  Otherwise the
    language is taken from :py:obj:`searx.utils.detect_language`, called on
    `search_query.query` with `threshold=0.3` and `only_search_languages=True`:

    * when a language is detected, `search_query.lang` is set to it and
      `search_query.locale` to the matching babel locale (``None`` when babel
      does not know the locale)
    * when no language is detected, `search_query.lang` falls back to "all"
      and `search_query.locale` to ``None``
    """
    if search_query.lang == 'auto':
        language = detect_language(search_query.query, threshold=0.3, only_search_languages=True)
        if language is not None:
            search_query.lang = language
            try:
                search_query.locale = babel.Locale.parse(search_query.lang)
            except babel.core.UnknownLocaleError:
                search_query.locale = None
        else:
            # no language has been detected: search in all languages
            search_query.lang = 'all'
            search_query.locale = None
class Search: class Search:
"""Search information container""" """Search information container"""
__slots__ = "search_query", "result_container", "start_time", "actual_timeout" __slots__ = "search_query", "result_container", "start_time", "actual_timeout"
def __init__(self, search_query: SearchQuery): def __init__(self, search_query: SearchQuery):
"""Initialize the Search """Initialize the Search"""
search_query is copied
"""
# init vars # init vars
super().__init__() super().__init__()
self.search_query = search_query
self.result_container = ResultContainer() self.result_container = ResultContainer()
self.start_time = None self.start_time = None
self.actual_timeout = None self.actual_timeout = None
self.search_query = copy(search_query)
self.update_search_query(self.search_query)
def update_search_query(self, search_query: SearchQuery):
"""Update search_query.
call replace_auto_language to replace the "auto" language
"""
replace_auto_language(search_query)
def search_external_bang(self): def search_external_bang(self):
""" """

View file

@ -31,7 +31,7 @@ search:
autocomplete_min: 4 autocomplete_min: 4
# Default search language - leave blank to detect from browser information or # Default search language - leave blank to detect from browser information or
# use codes from 'languages.py' # use codes from 'languages.py'
default_lang: "" default_lang: "auto"
# Available languages # Available languages
# languages: # languages:
# - all # - all

View file

@ -6,6 +6,7 @@ from searx.query import RawTextQuery
from searx.engines import categories, engines from searx.engines import categories, engines
from searx.search import SearchQuery, EngineRef from searx.search import SearchQuery, EngineRef
from searx.preferences import Preferences, is_locked from searx.preferences import Preferences, is_locked
from searx.utils import detect_language
# remove duplicate queries. # remove duplicate queries.
@ -214,7 +215,27 @@ def parse_engine_data(form):
def get_search_query_from_webapp( def get_search_query_from_webapp(
preferences: Preferences, form: Dict[str, str] preferences: Preferences, form: Dict[str, str]
) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef]]: ) -> Tuple[SearchQuery, RawTextQuery, List[EngineRef], List[EngineRef], str]:
"""Assemble data from preferences and request.form (from the HTML form) needed
in a search query.
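The returned tuple consists of:
1. instance of :py:obj:`searx.search.SearchQuery`
2. instance of :py:obj:`searx.query.RawTextQuery`
3. list of :py:obj:`searx.search.EngineRef` instances (``query_engineref_list_unknown``)
4. list of :py:obj:`searx.search.EngineRef` instances (``query_engineref_list_notoken``)
5. string with the *selected locale* of the query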
About language/locale: if the client selects the alias ``auto``, the
``SearchQuery`` object is built using the :py:obj:`detected language
<searx.utils.detect_language>`. If language recognition does not have a
match the language preferred by the :py:obj:`Preferences.client` is used.
If client does not have a preference, the default ``all`` is used.
The *selected locale* in the tuple always represents the selected
language/locale and might differ from the language recognition.
"""
# no text for the query ? # no text for the query ?
if not form.get('q'): if not form.get('q'):
raise SearxParameterException('q', '') raise SearxParameterException('q', '')
@ -229,13 +250,19 @@ def get_search_query_from_webapp(
# set query # set query
query = raw_text_query.getQuery() query = raw_text_query.getQuery()
query_pageno = parse_pageno(form) query_pageno = parse_pageno(form)
query_lang = parse_lang(preferences, form, raw_text_query)
query_safesearch = parse_safesearch(preferences, form) query_safesearch = parse_safesearch(preferences, form)
query_time_range = parse_time_range(form) query_time_range = parse_time_range(form)
query_timeout = parse_timeout(form, raw_text_query) query_timeout = parse_timeout(form, raw_text_query)
external_bang = raw_text_query.external_bang external_bang = raw_text_query.external_bang
engine_data = parse_engine_data(form) engine_data = parse_engine_data(form)
query_lang = parse_lang(preferences, form, raw_text_query)
selected_locale = query_lang
if query_lang == 'auto':
query_lang = detect_language(query, threshold=0.8, only_search_languages=True)
query_lang = query_lang or preferences.client.locale_tag or 'all'
if not is_locked('categories') and raw_text_query.specific: if not is_locked('categories') and raw_text_query.specific:
# if engines are calculated from query, # if engines are calculated from query,
# set categories by using that information # set categories by using that information
@ -265,4 +292,5 @@ def get_search_query_from_webapp(
raw_text_query, raw_text_query,
query_engineref_list_unknown, query_engineref_list_unknown,
query_engineref_list_notoken, query_engineref_list_notoken,
selected_locale,
) )

View file

@ -84,6 +84,7 @@ from searx.webutils import (
from searx.webadapter import ( from searx.webadapter import (
get_search_query_from_webapp, get_search_query_from_webapp,
get_selected_categories, get_selected_categories,
parse_lang,
) )
from searx.utils import ( from searx.utils import (
html_to_text, html_to_text,
@ -96,6 +97,7 @@ from searx.plugins import Plugin, plugins, initialize as plugin_initialize
from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import ( from searx.preferences import (
Preferences, Preferences,
ClientPref,
ValidationException, ValidationException,
) )
from searx.answerers import ( from searx.answerers import (
@ -221,16 +223,9 @@ babel = Babel(app, locale_selector=get_locale)
def _get_browser_language(req, lang_list): def _get_browser_language(req, lang_list):
for lang in req.headers.get("Accept-Language", "en").split(","): client = ClientPref.from_http_request(req)
if ';' in lang: locale = match_locale(client.locale_tag, lang_list, fallback='en')
lang = lang.split(';')[0] return locale
if '-' in lang:
lang_parts = lang.split('-')
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
locale = match_locale(lang, lang_list, fallback=None)
if locale is not None:
return locale
return 'en'
def _get_locale_rfc5646(locale): def _get_locale_rfc5646(locale):
@ -446,11 +441,7 @@ def render(template_name: str, **kwargs):
kwargs['rtl'] = True kwargs['rtl'] = True
if 'current_language' not in kwargs: if 'current_language' not in kwargs:
_locale = request.preferences.get_value('language') kwargs['current_language'] = parse_lang(request.preferences, {}, RawTextQuery('', []))
if _locale in ('auto', 'all'):
kwargs['current_language'] = _locale
else:
kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
# values from settings # values from settings
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
@ -512,7 +503,10 @@ def pre_request():
request.timings = [] # pylint: disable=assigning-non-slot request.timings = [] # pylint: disable=assigning-non-slot
request.errors = [] # pylint: disable=assigning-non-slot request.errors = [] # pylint: disable=assigning-non-slot
preferences = Preferences(themes, list(categories.keys()), engines, plugins) # pylint: disable=redefined-outer-name client_pref = ClientPref.from_http_request(request)
# pylint: disable=redefined-outer-name
preferences = Preferences(themes, list(categories.keys()), engines, plugins, client_pref)
user_agent = request.headers.get('User-Agent', '').lower() user_agent = request.headers.get('User-Agent', '').lower()
if 'webkit' in user_agent and 'android' in user_agent: if 'webkit' in user_agent and 'android' in user_agent:
preferences.key_value_settings['method'].value = 'GET' preferences.key_value_settings['method'].value = 'GET'
@ -681,7 +675,9 @@ def search():
raw_text_query = None raw_text_query = None
result_container = None result_container = None
try: try:
search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form) search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
request.preferences, request.form
)
# search = Search(search_query) # without plugins # search = Search(search_query) # without plugins
search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name
@ -812,13 +808,6 @@ def search():
) )
) )
if search_query.lang in ('auto', 'all'):
current_language = search_query.lang
else:
current_language = match_locale(
search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
)
# search_query.lang contains the user choice (all, auto, en, ...) # search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language # when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equals to search_query.lang # otherwise it is equals to search_query.lang
@ -841,7 +830,7 @@ def search():
result_container.unresponsive_engines result_container.unresponsive_engines
), ),
current_locale = request.preferences.get_value("locale"), current_locale = request.preferences.get_value("locale"),
current_language = current_language, current_language = selected_locale,
search_language = match_locale( search_language = match_locale(
search.search_query.lang, search.search_query.lang,
settings['search']['languages'], settings['search']['languages'],