From fe419e355bf1527c51e3aee98495d08b89510320 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 15 Jul 2022 18:38:32 +0200 Subject: [PATCH] The checker requires Redis Remove the abstraction in searx.shared.SharedDict. Implement a basic and dedicated scheduler for the checker using a Redis script. --- searx/plugins/limiter.py | 5 +- searx/search/checker/__init__.py | 2 + searx/search/checker/background.py | 114 ++++++++++++++--------------- searx/search/checker/scheduler.lua | 36 +++++++++ searx/search/checker/scheduler.py | 57 +++++++++++++++ searx/settings_defaults.py | 3 +- searx/shared/__init__.py | 41 +---------- searx/shared/redisdb.py | 18 ++--- searx/shared/shared_abstract.py | 22 ------ searx/shared/shared_simple.py | 40 ---------- searx/shared/shared_uwsgi.py | 64 ---------------- searx/webapp.py | 2 + 12 files changed, 167 insertions(+), 237 deletions(-) create mode 100644 searx/search/checker/scheduler.lua create mode 100644 searx/search/checker/scheduler.py delete mode 100644 searx/shared/shared_abstract.py delete mode 100644 searx/shared/shared_simple.py delete mode 100644 searx/shared/shared_uwsgi.py diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index c6f5d6a0f..c11fd506b 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -93,9 +93,8 @@ def init(app, settings): if not settings['server']['limiter']: return False - logger.debug("init limiter DB") # pylint: disable=undefined-variable - if not redisdb.init(): - logger.error("init limiter DB failed!!!") # pylint: disable=undefined-variable + if not redisdb.client(): + logger.error("The limiter requires Redis") # pylint: disable=undefined-variable return False app.before_request(pre_request) diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 85b9178df..7d779a282 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -2,3 +2,5 @@ from .impl import Checker from .background import initialize, get_result + +__all__ = ('Checker', 'initialize', 'get_result') diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py index 3908245f8..e5bd642c0 100644 --- a/searx/search/checker/background.py +++ b/searx/search/checker/background.py @@ -1,26 +1,28 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pylint: disable=missing-module-docstring -# pyright: strict +# pyright: basic import json -import random import time import threading import os import signal -from typing import Dict, Union, List, Any, Tuple +from typing import Dict, Union, List, Any, Tuple, Optional from typing_extensions import TypedDict, Literal +import redis.exceptions + from searx import logger, settings, searx_debug +from searx.shared.redisdb import client as get_redis_client from searx.exceptions import SearxSettingsException from searx.search.processors import PROCESSORS from searx.search.checker import Checker -from searx.shared import schedule, storage # pyright: ignore +from searx.search.checker.scheduler import scheduler_function -CHECKER_RESULT = 'CHECKER_RESULT' -running = threading.Lock() +REDIS_RESULT_KEY = 'SearXNG_checker_result' +REDIS_LOCK_KEY = 'SearXNG_checker_lock' CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther'] @@ -77,20 +79,24 @@ def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]: return (every[0], every[1]) -def _get_every(): - every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) - return _get_interval(every, 'checker.scheduling.every is not a int or list') - - def get_result() -> CheckerResult: - serialized_result = storage.get_str(CHECKER_RESULT) - if serialized_result is not None: - return json.loads(serialized_result) - return {'status': 'unknown'} + client = get_redis_client() + if client is None: + # without Redis, the checker is disabled + return {'status': 'disabled'} + serialized_result: Optional[bytes] = client.get(REDIS_RESULT_KEY) + if serialized_result is None: + # the Redis key does not exist + return {'status': 'unknown'} + return json.loads(serialized_result) def _set_result(result: CheckerResult): - storage.set_str(CHECKER_RESULT, json.dumps(result)) + client = get_redis_client() + if client is None: + # without Redis, the function does nothing + return + client.set(REDIS_RESULT_KEY, json.dumps(result)) def _timestamp(): @@ -98,41 +104,29 @@ def _timestamp(): def run(): - if not running.acquire(blocking=False): # pylint: disable=consider-using-with - return try: - logger.info('Starting checker') - result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()} - for name, processor in PROCESSORS.items(): - logger.debug('Checking %s engine', name) - checker = Checker(processor) - checker.run() - if checker.test_results.successful: - result['engines'][name] = {'success': True} - else: - result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} + # use a Redis lock to make sure there is no checker running at the same time + # (this should not happen, this is a safety measure) + with get_redis_client().lock(REDIS_LOCK_KEY, blocking_timeout=60, timeout=3600): + logger.info('Starting checker') + result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()} + for name, processor in PROCESSORS.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.successful: + result['engines'][name] = {'success': True} + else: + result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} - _set_result(result) - logger.info('Check done') + _set_result(result) + logger.info('Check done') + except redis.exceptions.LockError: + _set_result({'status': 'error', 'timestamp': _timestamp()}) + logger.exception('Error while running the checker') except Exception: # pylint: disable=broad-except _set_result({'status': 'error', 'timestamp': _timestamp()}) logger.exception('Error while running the checker') - finally: - running.release() - - -def _run_with_delay(): - every = _get_every() - delay = random.randint(0, every[1] - every[0]) - logger.debug('Start checker in %i seconds', delay) - time.sleep(delay) - run() - - -def _start_scheduling(): - every = _get_every() - if schedule(every[0], _run_with_delay): - run() def _signal_handler(_signum: int, _frame: Any): @@ -147,27 +141,31 @@ def initialize(): logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) signal.signal(signal.SIGUSR1, _signal_handler) - # disabled by default - _set_result({'status': 'disabled'}) - # special case when debug is activate - if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + if searx_debug and settings['checker']['off_when_debug']: logger.info('debug mode: checker is disabled') return # check value of checker.scheduling.every now - scheduling = settings.get('checker', {}).get('scheduling', None) + scheduling = settings['checker']['scheduling'] if scheduling is None or not scheduling: logger.info('Checker scheduler is disabled') return - # - _set_result({'status': 'unknown'}) + # make sure there is a Redis connection + if get_redis_client() is None: + logger.error('The checker requires Redis') + return - start_after = scheduling.get('start_after', (300, 1800)) - start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') - delay = random.randint(start_after[0], start_after[1]) - logger.info('Start checker in %i seconds', delay) - t = threading.Timer(delay, _start_scheduling) + # start the background scheduler + every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list') + start_after_range = _get_interval( + scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list' + ) + t = threading.Thread( + target=scheduler_function, + args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run), + name='checker_scheduler', + ) t.daemon = True t.start() diff --git a/searx/search/checker/scheduler.lua b/searx/search/checker/scheduler.lua new file mode 100644 index 000000000..b3c6023fe --- /dev/null +++ b/searx/search/checker/scheduler.lua @@ -0,0 +1,36 @@ +-- SPDX-License-Identifier: AGPL-3.0-or-later +-- +-- This script is not a string in scheduler.py, so editors can provide syntax highlighting. + +-- The Redis KEY is defined here and not in Python on purpose: +-- only this LUA script can read and update this key to avoid lock and concurrency issues. +local redis_key = 'SearXNG_checker_next_call_ts' + +local now = redis.call('TIME')[1] +local start_after_from = ARGV[1] +local start_after_to = ARGV[2] +local every_from = ARGV[3] +local every_to = ARGV[4] + +local next_call_ts = redis.call('GET', redis_key) + +if (next_call_ts == false or next_call_ts == nil) then + -- the scheduler has never run on this Redis instance, so: + -- 1/ the scheduler does not run now + -- 2/ the next call is a random time between start_after_from and start_after_to + local delay = start_after_from + math.random(start_after_to - start_after_from) + redis.call('SET', redis_key, now + delay) + return { false, delay } +end + +-- next_call_ts is defined +-- --> if now is lower than next_call_ts then we don't run the embedded checker +-- --> if now is higher then we update next_call_ts and ask to run the embedded checker now. +local call_now = next_call_ts <= now +if call_now then + -- the checker runs now, define the timestamp of the next call: + -- this is a random delay between every_from and every_to + local periodic_delay = every_from + math.random(every_to - every_from) + next_call_ts = redis.call('INCRBY', redis_key, periodic_delay) +end +return { call_now, next_call_ts - now } diff --git a/searx/search/checker/scheduler.py b/searx/search/checker/scheduler.py new file mode 100644 index 000000000..1ae635951 --- /dev/null +++ b/searx/search/checker/scheduler.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring +"""Lame scheduler which use Redis as a source of truth: +* the Redis key SearXNG_checker_next_call_ts contains the next time the embedded checker should run. +* to avoid lock, a unique Redis script reads and updates the Redis key SearXNG_checker_next_call_ts. +* this Redis script returns a list of two elements: + * the first one is a boolean. If True, the embedded checker must run now in this worker. + * the second element is the delay in second to wait before the next call to the Redis script. + +This scheduler is not generic on purpose: if more feature are required, a dedicate scheduler must be used +(= a better scheduler should not use the web workers) +""" + +import logging +import time +import importlib +from typing import Callable + +from searx.shared.redisdb import client as get_redis_client +from searx.redislib import lua_script_storage + + +logger = logging.getLogger('searx.search.checker') + + +def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable): + """Run the checker periodically. The function never returns. + + Parameters: + * start_after_from and start_after_to: when to call "callback" for the first on the Redis instance + * every_from and every_to: after the first call, how often to call "callback" + + There is no issue: + * to call this function is multiple workers + * to kill workers at any time as long there is one at least one worker + """ + scheduler_now_script = importlib.resources.read_text(__package__, "scheduler.lua") + while True: + # ask the Redis script what to do + # the script says + # * if the checker must run now. + # * how to long to way before calling the script again (it can be call earlier, but not later). + script = lua_script_storage(get_redis_client(), scheduler_now_script) + call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to]) + + # does the worker run the checker now? + if call_now: + # run the checker + try: + callback() + except Exception: # pylint: disable=broad-except + logger.exception("Error calling the embedded checker") + # only worker display the wait_time + logger.info("Next call to the checker in %s seconds", wait_time) + # wait until the next call + time.sleep(wait_time) diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index 6575b0b0c..a4f6ce607 100644 --- a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -225,7 +225,8 @@ SCHEMA = { 'plugins': SettingsValue(list, []), 'enabled_plugins': SettingsValue((None, list), None), 'checker': { - 'off_when_debug': SettingsValue(bool, True), + 'off_when_debug': SettingsValue(bool, True, None), + 'scheduling': SettingsValue((None, dict), None, None), }, 'categories_as_tabs': SettingsValue(dict, CATEGORIES_AS_TABS), 'engines': SettingsValue(list, []), diff --git a/searx/shared/__init__.py b/searx/shared/__init__.py index d10ddb33d..2c7fc9f8b 100644 --- a/searx/shared/__init__.py +++ b/searx/shared/__init__.py @@ -1,39 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Initialization of a *shared* storage. +""" -import logging -import importlib - -logger = logging.getLogger('searx.shared') - -__all__ = ['SharedDict', 'schedule'] - -try: - uwsgi = importlib.import_module('uwsgi') -except: - # no uwsgi - from .shared_simple import SimpleSharedDict as SharedDict, schedule - - logger.info('Use shared_simple implementation') -else: - try: - uwsgi.cache_update('dummy', b'dummy') - if uwsgi.cache_get('dummy') != b'dummy': - raise Exception() - except: - # uwsgi.ini configuration problem: disable all scheduling - logger.error( - 'uwsgi.ini configuration error, add this line to your uwsgi.ini\n' - 'cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1' - ) - from .shared_simple import SimpleSharedDict as SharedDict - - def schedule(delay, func, *args): - return False - - else: - # uwsgi - from .shared_uwsgi import UwsgiCacheSharedDict as SharedDict, schedule - - logger.info('Use shared_uwsgi implementation') - -storage = SharedDict() +from . import redisdb diff --git a/searx/shared/redisdb.py b/searx/shared/redisdb.py index bb7a0eeb4..d0071f72c 100644 --- a/searx/shared/redisdb.py +++ b/searx/shared/redisdb.py @@ -26,26 +26,20 @@ import redis from searx import get_setting -logger = logging.getLogger('searx.shared.redis') +logger = logging.getLogger('searx.shared.redisdb') _client = None -def client(): - global _client # pylint: disable=global-statement - if _client is None: - # not thread safe: in the worst case scenario, two or more clients are - # initialized only one is kept, the others are garbage collected. - _client = redis.Redis.from_url(get_setting('redis.url')) +def client() -> redis.Redis: return _client -def init(): +def initialize(): + global _client # pylint: disable=global-statement try: - c = client() - logger.info("connected redis DB --> %s", c.acl_whoami()) - return True + _client = redis.Redis.from_url(get_setting('redis.url')) + logger.info("connected redis: %s", get_setting('redis.url')) except redis.exceptions.ConnectionError as exc: _pw = pwd.getpwuid(os.getuid()) logger.error("[%s (%s)] can't connect redis DB ...", _pw.pw_name, _pw.pw_uid) logger.error(" %s", exc) - return False diff --git a/searx/shared/shared_abstract.py b/searx/shared/shared_abstract.py deleted file mode 100644 index af4be30ae..000000000 --- a/searx/shared/shared_abstract.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pyright: strict -from abc import ABC, abstractmethod -from typing import Optional - - -class SharedDict(ABC): - @abstractmethod - def get_int(self, key: str) -> Optional[int]: - pass - - @abstractmethod - def set_int(self, key: str, value: int): - pass - - @abstractmethod - def get_str(self, key: str) -> Optional[str]: - pass - - @abstractmethod - def set_str(self, key: str, value: str): - pass diff --git a/searx/shared/shared_simple.py b/searx/shared/shared_simple.py deleted file mode 100644 index 2b9d4c2da..000000000 --- a/searx/shared/shared_simple.py +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import threading -from typing import Optional - -from . import shared_abstract - - -class SimpleSharedDict(shared_abstract.SharedDict): - - __slots__ = ('d',) - - def __init__(self): - self.d = {} - - def get_int(self, key: str) -> Optional[int]: - return self.d.get(key, None) - - def set_int(self, key: str, value: int): - self.d[key] = value - - def get_str(self, key: str) -> Optional[str]: - return self.d.get(key, None) - - def set_str(self, key: str, value: str): - self.d[key] = value - - -def schedule(delay, func, *args): - def call_later(): - t = threading.Timer(delay, wrapper) - t.daemon = True - t.start() - - def wrapper(): - call_later() - func(*args) - - call_later() - return True diff --git a/searx/shared/shared_uwsgi.py b/searx/shared/shared_uwsgi.py deleted file mode 100644 index 0248c6234..000000000 --- a/searx/shared/shared_uwsgi.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import time -from typing import Optional -import uwsgi # pyright: ignore # pylint: disable=E0401 -from . import shared_abstract - - -_last_signal = 10 - - -class UwsgiCacheSharedDict(shared_abstract.SharedDict): - def get_int(self, key: str) -> Optional[int]: - value = uwsgi.cache_get(key) - if value is None: - return value - else: - return int.from_bytes(value, 'big') - - def set_int(self, key: str, value: int): - b = value.to_bytes(4, 'big') - uwsgi.cache_update(key, b) - - def get_str(self, key: str) -> Optional[str]: - value = uwsgi.cache_get(key) - if value is None: - return value - else: - return value.decode('utf-8') - - def set_str(self, key: str, value: str): - b = value.encode('utf-8') - uwsgi.cache_update(key, b) - - -def schedule(delay, func, *args): - """ - Can be implemented using a spooler. - https://uwsgi-docs.readthedocs.io/en/latest/PythonDecorators.html - - To make the uwsgi configuration simple, use the alternative implementation. - """ - global _last_signal - - def sighandler(signum): - now = int(time.time()) - key = 'scheduler_call_time_signal_' + str(signum) - uwsgi.lock() - try: - updating = uwsgi.cache_get(key) - if updating is not None: - updating = int.from_bytes(updating, 'big') - if now - updating < delay: - return - uwsgi.cache_update(key, now.to_bytes(4, 'big')) - finally: - uwsgi.unlock() - func(*args) - - signal_num = _last_signal - _last_signal += 1 - uwsgi.register_signal(signal_num, 'worker', sighandler) - uwsgi.add_timer(signal_num, delay) - return True diff --git a/searx/webapp.py b/searx/webapp.py index 5c3fbae8b..4f334a9d0 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -120,6 +120,7 @@ from searx.locales import ( # renaming names from searx imports ... from searx.autocomplete import search_autocomplete, backends as autocomplete_backends from searx.languages import language_codes as languages +from searx.shared.redisdb import initialize as redis_initialize from searx.search import SearchWithPlugins, initialize as search_initialize from searx.network import stream as http_stream, set_context_network_name from searx.search.checker import get_result as checker_get_result @@ -1384,6 +1385,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai if not werkzeug_reloader or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"): locales_initialize() _INFO_PAGES = infopage.InfoPageSet() + redis_initialize() plugin_initialize(app) search_initialize(enable_checker=True, check_network=True, enable_metrics=settings['general']['enable_metrics'])