Merge pull request #28 from searxng/mod-processors-error-message

[mod] processors: show identical error messages on /search and /stats
This commit is contained in:
Alexandre Flament 2021-04-27 16:50:41 +02:00 committed by GitHub
commit 87e914e398
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 135 additions and 84 deletions

View file

@@ -1,37 +1,49 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Implement request processores used by engine-types.
"""
__all__ = [
'EngineProcessor',
'OfflineProcessor',
'OnlineProcessor',
'OnlineDictionaryProcessor',
'OnlineCurrencyProcessor',
'processors',
]
from searx import logger
import searx.engines as engines
from .online import OnlineProcessor from .online import OnlineProcessor
from .offline import OfflineProcessor from .offline import OfflineProcessor
from .online_dictionary import OnlineDictionaryProcessor from .online_dictionary import OnlineDictionaryProcessor
from .online_currency import OnlineCurrencyProcessor from .online_currency import OnlineCurrencyProcessor
from .abstract import EngineProcessor from .abstract import EngineProcessor
from searx import logger
import searx.engines as engines
__all__ = ['EngineProcessor', 'OfflineProcessor', 'OnlineProcessor',
'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', 'processors']
logger = logger.getChild('search.processors') logger = logger.getChild('search.processors')
processors = {} processors = {}
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)"""
def get_processor_class(engine_type): def get_processor_class(engine_type):
"""Return processor class according to the ``engine_type``"""
for c in [OnlineProcessor, OfflineProcessor, OnlineDictionaryProcessor, OnlineCurrencyProcessor]: for c in [OnlineProcessor, OfflineProcessor, OnlineDictionaryProcessor, OnlineCurrencyProcessor]:
if c.engine_type == engine_type: if c.engine_type == engine_type:
return c return c
return None return None
def get_processor(engine, engine_name): def get_processor(engine, engine_name):
"""Return processor instance that fits to ``engine.engine.type``)"""
engine_type = getattr(engine, 'engine_type', 'online') engine_type = getattr(engine, 'engine_type', 'online')
processor_class = get_processor_class(engine_type) processor_class = get_processor_class(engine_type)
if processor_class: if processor_class:
return processor_class(engine, engine_name) return processor_class(engine, engine_name)
else:
return None return None
def initialize(engine_list): def initialize(engine_list):
"""Initialize all engines and store a processor for each engine in :py:obj:`processors`."""
engines.initialize_engines(engine_list) engines.initialize_engines(engine_list)
for engine_name, engine in engines.engines.items(): for engine_name, engine in engines.engines.items():
processor = get_processor(engine, engine_name) processor = get_processor(engine, engine_name)

View file

@@ -1,4 +1,9 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Abstract base classes for engine request processores.
"""
import threading import threading
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
@@ -10,12 +15,13 @@ from searx.network import get_time_for_thread, get_network
from searx.metrics import histogram_observe, counter_inc, count_exception, count_error from searx.metrics import histogram_observe, counter_inc, count_exception, count_error
from searx.exceptions import SearxEngineAccessDeniedException from searx.exceptions import SearxEngineAccessDeniedException
logger = logger.getChild('searx.search.processor') logger = logger.getChild('searx.search.processor')
SUSPENDED_STATUS = {} SUSPENDED_STATUS = {}
# pylint: disable=missing-function-docstring
class SuspendedStatus: class SuspendedStatus:
"""Class to handle suspend state."""
__slots__ = 'suspend_end_time', 'suspend_reason', 'continuous_errors', 'lock' __slots__ = 'suspend_end_time', 'suspend_reason', 'continuous_errors', 'lock'
@@ -49,6 +55,7 @@ class SuspendedStatus:
class EngineProcessor(ABC): class EngineProcessor(ABC):
"""Base classes used for all types of reqest processores."""
__slots__ = 'engine', 'engine_name', 'lock', 'suspended_status' __slots__ = 'engine', 'engine_name', 'lock', 'suspended_status'
@@ -59,22 +66,28 @@ class EngineProcessor(ABC):
key = id(key) if key else self.engine_name key = id(key) if key else self.engine_name
self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus()) self.suspended_status = SUSPENDED_STATUS.setdefault(key, SuspendedStatus())
def handle_exception(self, result_container, reason, exception, suspend=False, display_exception=True): def handle_exception(self, result_container, exception_or_message, suspend=False):
# update result_container # update result_container
error_message = str(exception) if display_exception and exception else None if isinstance(exception_or_message, BaseException):
result_container.add_unresponsive_engine(self.engine_name, reason, error_message) exception_class = exception_or_message.__class__
module_name = getattr(exception_class, '__module__', 'builtins')
module_name = '' if module_name == 'builtins' else module_name + '.'
error_message = module_name + exception_class.__qualname__
else:
error_message = exception_or_message
result_container.add_unresponsive_engine(self.engine_name, error_message)
# metrics # metrics
counter_inc('engine', self.engine_name, 'search', 'count', 'error') counter_inc('engine', self.engine_name, 'search', 'count', 'error')
if exception: if isinstance(exception_or_message, BaseException):
count_exception(self.engine_name, exception) count_exception(self.engine_name, exception_or_message)
else: else:
count_error(self.engine_name, reason) count_error(self.engine_name, exception_or_message)
# suspend the engine ? # suspend the engine ?
if suspend: if suspend:
suspended_time = None suspended_time = None
if isinstance(exception, SearxEngineAccessDeniedException): if isinstance(exception_or_message, SearxEngineAccessDeniedException):
suspended_time = exception.suspended_time suspended_time = exception_or_message.suspended_time
self.suspended_status.suspend(suspended_time, reason) # pylint: disable=no-member self.suspended_status.suspend(suspended_time, error_message) # pylint: disable=no-member
def _extend_container_basic(self, result_container, start_time, search_results): def _extend_container_basic(self, result_container, start_time, search_results):
# update result_container # update result_container
@@ -91,7 +104,7 @@ class EngineProcessor(ABC):
def extend_container(self, result_container, start_time, search_results): def extend_container(self, result_container, start_time, search_results):
if getattr(threading.current_thread(), '_timeout', False): if getattr(threading.current_thread(), '_timeout', False):
# the main thread is not waiting anymore # the main thread is not waiting anymore
self.handle_exception(result_container, 'Timeout', None) self.handle_exception(result_container, 'timeout', None)
else: else:
# check if the engine accepted the request # check if the engine accepted the request
if search_results is not None: if search_results is not None:
@@ -138,8 +151,6 @@ class EngineProcessor(ABC):
tests = getattr(self.engine, 'additional_tests', {}) tests = getattr(self.engine, 'additional_tests', {})
tests.update(self.get_default_tests()) tests.update(self.get_default_tests())
return tests return tests
else:
return tests
def get_default_tests(self): def get_default_tests(self): # pylint: disable=no-self-use
return {} return {}

View file

@@ -1,13 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Processores for engine-type: ``offline``
"""
from searx import logger from searx import logger
from searx.search.processors.abstract import EngineProcessor from .abstract import EngineProcessor
logger = logger.getChild('searx.search.processor.offline') logger = logger.getChild('searx.search.processor.offline')
class OfflineProcessor(EngineProcessor): class OfflineProcessor(EngineProcessor):
"""Processor class used by ``offline`` engines"""
engine_type = 'offline' engine_type = 'offline'
@@ -21,6 +25,6 @@ class OfflineProcessor(EngineProcessor):
except ValueError as e: except ValueError as e:
# do not record the error # do not record the error
logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e)) logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e))
except Exception as e: except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, 'unexpected crash', e) self.handle_exception(result_container, e)
logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e)) logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))

View file

@@ -1,24 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Processores for engine-type: ``online``
"""
from time import time from time import time
import asyncio import asyncio
import httpx import httpx
import searx.network import searx.network
from searx import logger from searx import logger
from searx.utils import gen_useragent from searx.utils import gen_useragent
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException, from searx.exceptions import (
SearxEngineTooManyRequestsException,) SearxEngineAccessDeniedException,
SearxEngineCaptchaException,
SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error from searx.metrics.error_recorder import count_error
from .abstract import EngineProcessor
from searx.search.processors.abstract import EngineProcessor
logger = logger.getChild('searx.search.processor.online') logger = logger.getChild('searx.search.processor.online')
def default_request_params(): def default_request_params():
"""Default request parameters for ``online`` engines."""
return { return {
'method': 'GET', 'method': 'GET',
'headers': {}, 'headers': {},
@@ -31,6 +36,7 @@ def default_request_params():
class OnlineProcessor(EngineProcessor): class OnlineProcessor(EngineProcessor):
"""Processor class for ``online`` engines."""
engine_type = 'online' engine_type = 'online'
@@ -130,7 +136,7 @@ class OnlineProcessor(EngineProcessor):
self.extend_container(result_container, start_time, search_results) self.extend_container(result_container, start_time, search_results)
except (httpx.TimeoutException, asyncio.TimeoutError) as e: except (httpx.TimeoutException, asyncio.TimeoutError) as e:
# requests timeout (connect or read) # requests timeout (connect or read)
self.handle_exception(result_container, 'HTTP timeout', e, suspend=True, display_exception=False) self.handle_exception(result_container, e, suspend=True)
logger.error("engine {0} : HTTP requests timeout" logger.error("engine {0} : HTTP requests timeout"
"(search duration : {1} s, timeout: {2} s) : {3}" "(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, time() - start_time, .format(self.engine_name, time() - start_time,
@@ -138,23 +144,23 @@ class OnlineProcessor(EngineProcessor):
e.__class__.__name__)) e.__class__.__name__))
except (httpx.HTTPError, httpx.StreamError) as e: except (httpx.HTTPError, httpx.StreamError) as e:
# other requests exception # other requests exception
self.handle_exception(result_container, 'HTTP error', e, suspend=True, display_exception=False) self.handle_exception(result_container, e, suspend=True)
logger.exception("engine {0} : requests exception" logger.exception("engine {0} : requests exception"
"(search duration : {1} s, timeout: {2} s) : {3}" "(search duration : {1} s, timeout: {2} s) : {3}"
.format(self.engine_name, time() - start_time, .format(self.engine_name, time() - start_time,
timeout_limit, timeout_limit,
e)) e))
except SearxEngineCaptchaException as e: except SearxEngineCaptchaException as e:
self.handle_exception(result_container, 'CAPTCHA required', e, suspend=True, display_exception=False) self.handle_exception(result_container, e, suspend=True)
logger.exception('engine {0} : CAPTCHA'.format(self.engine_name)) logger.exception('engine {0} : CAPTCHA'.format(self.engine_name))
except SearxEngineTooManyRequestsException as e: except SearxEngineTooManyRequestsException as e:
self.handle_exception(result_container, 'too many requests', e, suspend=True, display_exception=False) self.handle_exception(result_container, e, suspend=True)
logger.exception('engine {0} : Too many requests'.format(self.engine_name)) logger.exception('engine {0} : Too many requests'.format(self.engine_name))
except SearxEngineAccessDeniedException as e: except SearxEngineAccessDeniedException as e:
self.handle_exception(result_container, 'blocked', e, suspend=True, display_exception=False) self.handle_exception(result_container, e, suspend=True)
logger.exception('engine {0} : Searx is blocked'.format(self.engine_name)) logger.exception('engine {0} : Searx is blocked'.format(self.engine_name))
except Exception as e: except Exception as e: # pylint: disable=broad-except
self.handle_exception(result_container, 'unexpected crash', e, display_exception=False) self.handle_exception(result_container, e)
logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e)) logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
def get_default_tests(self): def get_default_tests(self):

View file

@@ -1,4 +1,8 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Processores for engine-type: ``online_currency``
"""
import unicodedata import unicodedata
import re import re
@@ -6,32 +10,31 @@ import re
from searx.data import CURRENCIES from searx.data import CURRENCIES
from .online import OnlineProcessor from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
# pylint: disable=missing-function-docstring
def normalize_name(name): def normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s') name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name) name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower() return unicodedata.normalize('NFKD', name).lower()
def name_to_iso4217(name): def name_to_iso4217(name):
global CURRENCIES global CURRENCIES # pylint: disable=global-statement
name = normalize_name(name) name = normalize_name(name)
currency = CURRENCIES['names'].get(name, [name]) currency = CURRENCIES['names'].get(name, [name])
if isinstance(currency, str): if isinstance(currency, str):
return currency return currency
return currency[0] return currency[0]
def iso4217_to_name(iso4217, language): def iso4217_to_name(iso4217, language):
global CURRENCIES global CURRENCIES # pylint: disable=global-statement
return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217) return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
class OnlineCurrencyProcessor(OnlineProcessor): class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines."""
engine_type = 'online_currency' engine_type = 'online_currency'
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):

View file

@@ -1,15 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Processores for engine-type: ``online_dictionary``
"""
import re import re
from searx.utils import is_valid_lang from searx.utils import is_valid_lang
from .online import OnlineProcessor from .online import OnlineProcessor
parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
class OnlineDictionaryProcessor(OnlineProcessor): class OnlineDictionaryProcessor(OnlineProcessor):
"""Processor class used by ``online_dictionnary`` engines."""
engine_type = 'online_dictionnary' engine_type = 'online_dictionnary'

View file

@@ -172,28 +172,34 @@ _category_names = (gettext('files'),
gettext('science')) gettext('science'))
# #
exception_classname_to_label = { timeout_text = gettext('timeout')
"searx.exceptions.SearxEngineCaptchaException": gettext("CAPTCHA"), parsing_error_text = gettext('parsing error')
"searx.exceptions.SearxEngineTooManyRequestsException": gettext("too many requests"), http_protocol_error_text = gettext('HTTP protocol error')
"searx.exceptions.SearxEngineAccessDeniedException": gettext("access denied"), network_error_text = gettext('network error')
"searx.exceptions.SearxEngineAPIException": gettext("server API error"), exception_classname_to_text = {
"httpx.TimeoutException": gettext("HTTP timeout"), None: gettext('unexpected crash'),
"httpx.ConnectTimeout": gettext("HTTP timeout"), 'timeout': timeout_text,
"httpx.ReadTimeout": gettext("HTTP timeout"), 'asyncio.TimeoutError': timeout_text,
"httpx.WriteTimeout": gettext("HTTP timeout"), 'httpx.TimeoutException': timeout_text,
"httpx.HTTPStatusError": gettext("HTTP error"), 'httpx.ConnectTimeout': timeout_text,
"httpx.ConnectError": gettext("HTTP connection error"), 'httpx.ReadTimeout': timeout_text,
"httpx.RemoteProtocolError": gettext("HTTP protocol error"), 'httpx.WriteTimeout': timeout_text,
"httpx.LocalProtocolError": gettext("HTTP protocol error"), 'httpx.HTTPStatusError': gettext('HTTP error'),
"httpx.ProtocolError": gettext("HTTP protocol error"), 'httpx.ConnectError': gettext("HTTP connection error"),
"httpx.ReadError": gettext("network error"), 'httpx.RemoteProtocolError': http_protocol_error_text,
"httpx.WriteError": gettext("network error"), 'httpx.LocalProtocolError': http_protocol_error_text,
"httpx.ProxyError": gettext("proxy error"), 'httpx.ProtocolError': http_protocol_error_text,
"searx.exceptions.SearxEngineXPathException": gettext("parsing error"), 'httpx.ReadError': network_error_text,
"KeyError": gettext("parsing error"), 'httpx.WriteError': network_error_text,
"json.decoder.JSONDecodeError": gettext("parsing error"), 'httpx.ProxyError': gettext("proxy error"),
"lxml.etree.ParserError": gettext("parsing error"), 'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
None: gettext("unexpected crash"), 'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
'searx.exceptions.SearxEngineXPathException': parsing_error_text,
'KeyError': parsing_error_text,
'json.decoder.JSONDecodeError': parsing_error_text,
'lxml.etree.ParserError': parsing_error_text,
} }
_flask_babel_get_translations = flask_babel.get_translations _flask_babel_get_translations = flask_babel.get_translations
@@ -786,15 +792,21 @@ def search():
def __get_translated_errors(unresponsive_engines): def __get_translated_errors(unresponsive_engines):
translated_errors = set() translated_errors = []
for unresponsive_engine in unresponsive_engines: # make a copy unresponsive_engines to avoid "RuntimeError: Set changed size during iteration"
error_msg = gettext(unresponsive_engine[1]) # it happens when an engine modifies the ResultContainer after the search_multiple_requests method
# has stopped waiting
for unresponsive_engine in list(unresponsive_engines):
error_user_text = exception_classname_to_text.get(unresponsive_engine[1])
if not error_user_text:
error_user_text = exception_classname_to_text[None]
error_msg = gettext(error_user_text)
if unresponsive_engine[2]: if unresponsive_engine[2]:
error_msg = "{} {}".format(error_msg, unresponsive_engine[2]) error_msg = "{} {}".format(error_msg, unresponsive_engine[2])
if unresponsive_engine[3]: if unresponsive_engine[3]:
error_msg = gettext('Suspended') + ': ' + error_msg error_msg = gettext('Suspended') + ': ' + error_msg
translated_errors.add((unresponsive_engine[0], error_msg)) translated_errors.append((unresponsive_engine[0], error_msg))
return translated_errors return sorted(translated_errors, key=lambda e: e[0])
@app.route('/about', methods=['GET']) @app.route('/about', methods=['GET'])
@@ -944,14 +956,14 @@ def preferences():
# the first element has the highest percentage rate. # the first element has the highest percentage rate.
reliabilities_errors = [] reliabilities_errors = []
for error in errors: for error in errors:
error_user_message = None error_user_text = None
if error.get('secondary') or 'exception_classname' not in error: if error.get('secondary') or 'exception_classname' not in error:
continue continue
error_user_message = exception_classname_to_label.get(error.get('exception_classname')) error_user_text = exception_classname_to_text.get(error.get('exception_classname'))
if not error: if not error:
error_user_message = exception_classname_to_label[None] error_user_text = exception_classname_to_text[None]
if error_user_message not in reliabilities_errors: if error_user_text not in reliabilities_errors:
reliabilities_errors.append(error_user_message) reliabilities_errors.append(error_user_text)
reliabilities[e.name]['errors'] = reliabilities_errors reliabilities[e.name]['errors'] = reliabilities_errors
# supports # supports