From b8c7c2c9aa604fd1fb7be5559c9ad025ceb17aa4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 28 May 2023 18:58:31 +0200 Subject: [PATCH] [mod] botdetection - improve ip_limit and link_token methods - counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 16 +--- searx/botdetection/_helpers.py | 93 ++++++++++++++++++++++ searx/botdetection/http_accept.py | 8 +- searx/botdetection/http_accept_encoding.py | 8 +- searx/botdetection/http_accept_language.py | 8 +- searx/botdetection/http_connection.py | 8 +- searx/botdetection/http_user_agent.py | 11 ++- searx/botdetection/ip_limit.py | 61 +++++++------- searx/botdetection/limiter.py | 11 ++- searx/botdetection/link_token.py | 43 +++++++--- searx/plugins/limiter.py | 14 +--- 11 files changed, 197 insertions(+), 84 deletions(-) create mode 100644 searx/botdetection/_helpers.py diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index 78a7d30f3..b4de0f9c8 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -9,18 +9,4 @@ The methods implemented in this python package are use by the :ref:`limiter src` """ -import flask - - -def dump_request(request: flask.Request): - return ( - "%s: '%s'" % (request.headers.get('X-Forwarded-For'), request.path) - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept') - + " || Accept-Language: %s" % request.headers.get('Accept-Language') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') - + " || Content-Type: %s" % request.headers.get('Content-Type') - + " || Content-Length: %s" % request.headers.get('Content-Length') - + " || Connection: %s" % request.headers.get('Connection') - + " || User-Agent: %s" % request.headers.get('User-Agent') - ) +from ._helpers import dump_request diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py new file mode 100644 index 000000000..b034b980b --- /dev/null +++ b/searx/botdetection/_helpers.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pylint: disable=missing-module-docstring, invalid-name + +from typing import Optional +import flask +import werkzeug + +from searx import logger + +logger = logger.getChild('botdetection') + + +def dump_request(request: flask.Request): + return ( + "%s: %s" % (get_real_ip(request), request.path) + + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + + " || form: %s" % request.form + + " || Accept: %s" % request.headers.get('Accept') + + " || Accept-Language: %s" % request.headers.get('Accept-Language') + + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') + + " || Content-Type: %s" % request.headers.get('Content-Type') + + " || Content-Length: %s" % request.headers.get('Content-Length') + + " || Connection: %s" % request.headers.get('Connection') + + " || User-Agent: %s" % request.headers.get('User-Agent') + ) + + +def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: + log_prefix = 'BLOCK %s: ' % get_real_ip(request) + logger.debug(log_prefix + log_msg) + return flask.make_response(('Too Many Requests', 429)) + + +def get_real_ip(request: flask.Request) -> str: + """Returns real IP of the request. Since not all proxies set all the HTTP + headers and incoming headers can be faked it may happen that the IP cannot + be determined correctly. + + .. sidebar:: :py:obj:`flask.Request.remote_addr` + + SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). + + This function tries to get the remote IP in the order listed below, + additional some tests are done and if inconsistencies or errors are + detected, they are logged. + + The remote IP of the request is taken from (first match): + + - X-Forwarded-For_ header + - `X-real-IP header `__ + - :py:obj:`flask.Request.remote_addr` + + .. _ProxyFix: + https://werkzeug.palletsprojects.com/middleware/proxy_fix/ + + .. _X-Forwarded-For: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For + + """ + + forwarded_for = request.headers.get("X-Forwarded-For") + real_ip = request.headers.get('X-Real-IP') + remote_addr = request.remote_addr + logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + + if not forwarded_for: + logger.error("X-Forwarded-For header is not set!") + else: + from .limiter import get_cfg # pylint: disable=import-outside-toplevel, cyclic-import + + forwarded_for = [x.strip() for x in forwarded_for.split(',')] + x_for: int = get_cfg()['real_ip.x_for'] + forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] + + if not real_ip: + logger.error("X-Real-IP header is not set!") + + if forwarded_for and real_ip and forwarded_for != real_ip: + logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) + + if forwarded_for and remote_addr and forwarded_for != remote_addr: + logger.warning( + "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for + ) + + if real_ip and remote_addr and real_ip != remote_addr: + logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) + + request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' + logger.debug("get_real_ip() -> %s", request_ip) + return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 23670a283..60e2330ae 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,13 +15,15 @@ Accept_ header .. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if 'text/html' not in request.accept_mimetypes: - return 429, "bot detected, HTTP header Accept did not contain text/html" + return too_many_requests(request, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 191249711..5301c5d9d 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,14 +16,16 @@ bot if the Accept-Encoding_ header .. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" + return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 558a216cf..060f67ec0 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -13,13 +13,15 @@ if the Accept-Language_ header is unset. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Accept-Language', '').strip() == '': - return 429, "bot detected, missing HTTP header Accept-Language" + return too_many_requests(request, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index 0ef24a7b8..e718dfe3f 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,13 +13,15 @@ the Connection_ header is set to ``close``. """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: if request.headers.get('Connection', '').strip() == 'close': - return 429, "bot detected, HTTP header 'Connection=close'" + return too_many_requests(request, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 3d1ec9173..70309e975 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,11 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional, Tuple +from typing import Optional import re import flask +import werkzeug from searx.tools import config +from ._helpers import too_many_requests USER_AGENT = ( @@ -48,11 +50,8 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return ( - 429, - f"bot detected, HTTP header User-Agent: {user_agent}", - ) + return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 9cffff7f0..e7fa57187 100644 --- a/searx/botdetection/ip_limit.py +++ b/searx/botdetection/ip_limit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint """.. _botdetection.ip_limit: Method ``ip_limit`` @@ -37,16 +39,18 @@ droped. """ -from typing import Optional, Tuple +from typing import Optional import flask +import werkzeug from searx.tools import config - from searx import redisdb from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token +from ._helpers import too_many_requests + logger = logger.getChild('botdetection.ip_limit') @@ -81,50 +85,51 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]: +def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - x_forwarded_for = request.headers.get('X-Forwarded-For', '') - if not x_forwarded_for: + client_ip = request.headers.get('X-Forwarded-For', '') + if not client_ip: logger.error("missing HTTP header X-Forwarded-For") if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'IP limit - API_WONDOW:' + x_forwarded_for, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) if c > API_MAX: - return 429, "BLOCK %s: API limit exceeded" - - suspicious = False - suspicious_ip_counter = 'IP limit - SUSPICIOUS_IP_WINDOW:' + x_forwarded_for + return too_many_requests(request, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request) - if suspicious: + suspicious = link_token.is_suspicious(request, True) + + if not suspicious: + # this IP is no longer suspicious: release ip again / delete the counter of this IP + drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, suspicious_ip_counter, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) if c > SUSPICIOUS_IP_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in SUSPICIOUS_IP_WINDOW" + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip) + return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX_SUSPICIOUS" + return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") - else: + return None - if cfg['botdetection.ip_limit.link_token']: - # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, suspicious_ip_counter) + # vanilla limiter without extensions counts BURST_MAX and LONG_MAX + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + if c > BURST_MAX: + return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) - if c > BURST_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in BURST_MAX" + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + if c > LONG_MAX: + return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)") - c = incr_sliding_window(redis_client, 'IP limit - LONG_WINDOW:' + x_forwarded_for, LONG_WINDOW) - if c > LONG_MAX: - return 429, f"bot detected, too many request from {x_forwarded_for} in LONG_MAX" return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index cc1e00b3c..93826684f 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -42,6 +42,7 @@ from pathlib import Path import flask import pytomlpp as toml +from searx import logger from searx.tools import config from searx.botdetection import ( http_accept, @@ -62,7 +63,13 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." } -CFG = config.Config({}, {}) +CFG = None + + +def get_cfg() -> config.Config: + if CFG is None: + init_cfg(logger) + return CFG def init_cfg(log): @@ -73,7 +80,7 @@ def init_cfg(log): log.warning("missing config file: %s", LIMITER_CFG) return - log.warning("load config file: %s", LIMITER_CFG) + log.info("load config file: %s", LIMITER_CFG) try: upd_cfg = toml.load(LIMITER_CFG) except toml.DecodeError as exc: diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index 8ef215f6c..376d06d61 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -47,15 +47,24 @@ from searx.redislib import secret_hash TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" +PING_LIVE_TIME = 3600 +"""Livetime (sec) of the ping-key from a client (request)""" + PING_KEY = 'SearXNG_limiter.ping' +"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" + TOKEN_KEY = 'SearXNG_limiter.token' +"""Key for which the current token is stored in the DB""" logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request): +def is_suspicious(request: flask.Request, renew: bool = False): """Checks if there is a valid ping for this request, if not this request is - rated as *suspicious*""" + rated as *suspicious*. If a valid ping exists and argument ``renew`` is + ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. + + """ redis_client = redisdb.client() if not redis_client: return False @@ -69,12 +78,19 @@ def is_suspicious(request: flask.Request): ) return True - logger.debug("found ping for this request: %s", ping_key) + if renew: + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) + + logger.debug("found ping for client request: %s", ping_key) return False def ping(request: flask.Request, token: str): - """This function is called by a request to URL ``/client.css``""" + """This function is called by a request to URL ``/client.css``. If + ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. + The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. + + """ redis_client = redisdb.client() if not redis_client: return @@ -82,19 +98,24 @@ def ping(request: flask.Request, token: str): return ping_key = get_ping_key(request) logger.debug("store ping for: %s", ping_key) - redis_client.set(ping_key, 1, ex=TOKEN_LIVE_TIME) + redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a request. At least - X-Forwarded-For_ is needed to be able to assign the request to an IP. + """Generates a hashed key that fits (more or less) to a client (request). + At least X-Forwarded-For_ is needed to be able to assign the request to an + IP. """ - return secret_hash( + return ( PING_KEY - + request.headers.get('X-Forwarded-For', '') - + request.headers.get('Accept-Language', '') - + request.headers.get('User-Agent', '') + + "[" + + secret_hash( + request.headers.get('X-Forwarded-For', '') + + request.headers.get('Accept-Language', '') + + request.headers.get('User-Agent', '') + ) + + "]" ) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 92b0aa2a0..7edbb1ce0 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -20,16 +20,10 @@ logger = logger.getChild('limiter') def pre_request(): """See :ref:`flask.Flask.before_request`""" - - val = limiter.filter_request(flask.request) - if val is not None: - http_status, msg = val - client_ip = flask.request.headers.get('X-Forwarded-For', '') - logger.error("BLOCK (IP %s): %s" % (client_ip, msg)) - return 'Too Many Requests', http_status - - logger.debug("OK: %s" % dump_request(flask.request)) - return None + ret_val = limiter.filter_request(flask.request) + if ret_val is None: + logger.debug("OK: %s" % dump_request(flask.request)) + return ret_val def init(app: flask.Flask, settings) -> bool: