forked from Ponysearch/Ponysearch
[fix] engine: duckduckgo - CAPTCHA detection
The previous implementation could not distinguish a CAPTCHA response from an ordinary result list: a CAPTCHA response was treated as a result list containing no items. DDG does not block IPs. Instead, it places a CAPTCHA wall in front of any request it considers dubious. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
88caa1d7db
commit
050451347b
2 changed files with 18 additions and 5 deletions
|
@ -25,6 +25,7 @@ from searx.network import get # see https://github.com/searxng/searxng/issues/7
|
||||||
from searx import redisdb
|
from searx import redisdb
|
||||||
from searx.enginelib.traits import EngineTraits
|
from searx.enginelib.traits import EngineTraits
|
||||||
from searx.utils import extr
|
from searx.utils import extr
|
||||||
|
from searx.exceptions import SearxEngineCaptchaException
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import logging
|
import logging
|
||||||
|
@ -292,6 +293,15 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def detect_ddg_captcha(dom):
|
||||||
|
"""In case of a CAPTCHA, DDG opens its own *not a Robot* dialog and is
|
||||||
|
not redirected to CAPTCHA page.
|
||||||
|
"""
|
||||||
|
if eval_xpath(dom, "//form[@id='challenge-form']"):
|
||||||
|
# setting the suspend time to zero is OK --> DDG does not block the IP
|
||||||
|
raise SearxEngineCaptchaException(suspended_time=0)
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
|
|
||||||
if resp.status_code == 303:
|
if resp.status_code == 303:
|
||||||
|
@ -299,6 +309,7 @@ def response(resp):
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
doc = lxml.html.fromstring(resp.text)
|
doc = lxml.html.fromstring(resp.text)
|
||||||
|
detect_ddg_captcha(doc)
|
||||||
|
|
||||||
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
|
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""Exception types raised by SearXNG modules.
|
"""Exception types raised by SearXNG modules.
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
@ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
|
||||||
"""This settings contains the default suspended time (default 86400 sec / 1
|
"""This settings contains the default suspended time (default 86400 sec / 1
|
||||||
day)."""
|
day)."""
|
||||||
|
|
||||||
def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
|
def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'):
|
||||||
"""Generic exception to raise when an engine denies access to the results.
|
"""Generic exception to raise when an engine denies access to the results.
|
||||||
|
|
||||||
:param suspended_time: How long the engine is going to be suspended in
|
:param suspended_time: How long the engine is going to be suspended in
|
||||||
|
@ -70,12 +71,13 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
|
||||||
:param message: Internal message. Defaults to ``Access denied``
|
:param message: Internal message. Defaults to ``Access denied``
|
||||||
:type message: str
|
:type message: str
|
||||||
"""
|
"""
|
||||||
suspended_time = suspended_time or self._get_default_suspended_time()
|
if suspended_time is None:
|
||||||
|
suspended_time = self._get_default_suspended_time()
|
||||||
super().__init__(message + ', suspended_time=' + str(suspended_time))
|
super().__init__(message + ', suspended_time=' + str(suspended_time))
|
||||||
self.suspended_time = suspended_time
|
self.suspended_time = suspended_time
|
||||||
self.message = message
|
self.message = message
|
||||||
|
|
||||||
def _get_default_suspended_time(self):
|
def _get_default_suspended_time(self) -> int:
|
||||||
from searx import get_setting # pylint: disable=C0415
|
from searx import get_setting # pylint: disable=C0415
|
||||||
|
|
||||||
return get_setting(self.SUSPEND_TIME_SETTING)
|
return get_setting(self.SUSPEND_TIME_SETTING)
|
||||||
|
@ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
|
||||||
"""This settings contains the default suspended time (default 86400 sec / 1
|
"""This settings contains the default suspended time (default 86400 sec / 1
|
||||||
day)."""
|
day)."""
|
||||||
|
|
||||||
def __init__(self, suspended_time=None, message='CAPTCHA'):
|
def __init__(self, suspended_time: int | None = None, message='CAPTCHA'):
|
||||||
super().__init__(message=message, suspended_time=suspended_time)
|
super().__init__(message=message, suspended_time=suspended_time)
|
||||||
|
|
||||||
|
|
||||||
|
@ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
|
||||||
"""This settings contains the default suspended time (default 3660 sec / 1
|
"""This settings contains the default suspended time (default 3660 sec / 1
|
||||||
hour)."""
|
hour)."""
|
||||||
|
|
||||||
def __init__(self, suspended_time=None, message='Too many request'):
|
def __init__(self, suspended_time: int | None = None, message='Too many request'):
|
||||||
super().__init__(message=message, suspended_time=suspended_time)
|
super().__init__(message=message, suspended_time=suspended_time)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue