Merge remote-tracking branch 'upstream/master'

Arcane Spark 2025-02-20 14:50:32 +01:00
commit 5d7b7ec199
8 changed files with 181 additions and 96 deletions

AUTHORS.rst

@@ -175,3 +175,4 @@ features or generally made searx better:
 - Daniel Kukula `<https://github.com/dkuku>`
 - Patrick Evans `https://github.com/holysoles`
 - Daniel Mowitz `<https://daniel.mowitz.rocks>`
+- `Bearz314 <https://github.com/bearz314>`_

searx/botdetection/ip_limit.py

@@ -123,7 +123,9 @@ def filter_request(
         )
         if c > SUSPICIOUS_IP_MAX:
             logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
-            return flask.redirect(flask.url_for('index'), code=302)
+            response = flask.redirect(flask.url_for('index'), code=302)
+            response.headers["Cache-Control"] = "no-store, max-age=0"
+            return response
 
     c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
     if c > BURST_MAX_SUSPICIOUS:
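
Note: `incr_sliding_window` counts requests per client network in Redis; once the count passes SUSPICIOUS_IP_MAX the client is redirected to the index page, and the new `Cache-Control: no-store` header keeps that 302 from being cached and replayed after the window has expired. The sketch below shows the counting idea with a fixed-window counter; it is an illustration only, not SearXNG's actual implementation, and it assumes redis-py >= 4.2 / Redis >= 7.0 for `EXPIRE ... NX`.

    import redis

    def incr_window(client: redis.Redis, key: str, window_sec: int) -> int:
        # Count one hit against `key` and return the running total.  The key
        # expires `window_sec` seconds after its first hit, so the counter
        # resets once the window has passed (fixed-window approximation).
        pipe = client.pipeline()
        pipe.incr(key)
        pipe.expire(key, window_sec, nx=True)  # set a TTL only if none exists yet
        count, _ = pipe.execute()
        return count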

searx/engines/brave.py

@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
     if brave_category in ('search', 'goggles'):
         return _parse_search(resp)
 
+    if brave_category == 'news':
+        return _parse_news(resp)
+
     datastr = extr(resp.text, "const data = ", ";\n").strip()
     json_data = js_variable_to_python(datastr)
     json_resp = json_data[1]['data']['body']['response']
 
-    if brave_category == 'news':
-        return _parse_news(json_resp['news'])
-
     if brave_category == 'images':
         return _parse_images(json_resp)
     if brave_category == 'videos':
@@ -339,18 +339,31 @@ def _parse_search(resp) -> EngineResults:
     return result_list
 
 
-def _parse_news(json_resp) -> EngineResults:
-    result_list = EngineResults()
+def _parse_news(resp) -> EngineResults:
+    result_list = EngineResults()
+    dom = html.fromstring(resp.text)
+
+    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
+        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+        if url is None:
+            continue
+
+        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
+        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
+        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
 
-    for result in json_resp["results"]:
-        item = {
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['description'],
-            'publishedDate': _extract_published_date(result['age']),
+        item = {
+            "url": url,
+            "title": title,
+            "content": content,
+            "thumbnail": thumbnail,
         }
-        if result['thumbnail'] is not None:
-            item['thumbnail'] = result['thumbnail']['src']
         result_list.append(item)
 
     return result_list
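
Note: `eval_xpath_list`, `eval_xpath_getindex` and `extract_text` are SearXNG helpers from `searx.utils`. The standalone sketch below mirrors the new news loop with plain lxml calls; the sample markup is an invented stand-in for Brave's real result page.

    from lxml import html

    SAMPLE = '''
    <div class="results">
      <div data-type="news">
        <a class="result-header" href="https://example.org/story">story</a>
        <span class="snippet-title">Example story</span>
        <p class="desc">Short description.</p>
        <div class="image-wrapper"><img src="https://example.org/t.jpg"></div>
      </div>
    </div>
    '''

    dom = html.fromstring(SAMPLE)
    for result in dom.xpath('//div[contains(@class, "results")]//div[@data-type="news"]'):
        hrefs = result.xpath('.//a[contains(@class, "result-header")]/@href')
        if not hrefs:  # skip results without a link, like the `continue` above
            continue
        print({
            "url": hrefs[0],
            "title": " ".join(result.xpath('.//span[contains(@class, "snippet-title")]/text()')),
            "content": " ".join(result.xpath('.//p[contains(@class, "desc")]/text()')),
            "thumbnail": next(iter(result.xpath('.//div[contains(@class, "image-wrapper")]//img/@src')), ""),
        })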

searx/engines/mojeek.py

@@ -67,11 +67,13 @@ def request(query, params):
     args = {
         'q': query,
         'safe': min(params['safesearch'], 1),
-        'fmt': search_type,
         language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
         region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
     }
 
+    if search_type:
+        args['fmt'] = search_type
+
     if search_type == '':
         args['s'] = 10 * (params['pageno'] - 1)
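
Note: dropping the empty `fmt` value matters because every key in `args` ends up in the query string. A minimal illustration (the endpoint URL and parameter values are made up for the example):

    from urllib.parse import urlencode

    args = {'q': 'test', 'safe': 1}            # web search: no 'fmt' at all
    print('https://www.mojeek.com/search?' + urlencode(args))
    # https://www.mojeek.com/search?q=test&safe=1

    args['fmt'] = 'news'                       # news search: 'fmt' is set
    print('https://www.mojeek.com/search?' + urlencode(args))
    # https://www.mojeek.com/search?q=test&safe=1&fmt=news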

searx/engines/startpage.py

@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
 .. hint::
 
-   The default category is ``web`` .. and other categories than ``web`` are not
-   yet implemented.
+   Supported categories are ``web``, ``news`` and ``images``.
 
 """
 # pylint: disable=too-many-statements
 
 from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from collections import OrderedDict
 import re
 from unicodedata import normalize, combining
 from time import time
 from datetime import datetime, timedelta
+from json import loads
 
 import dateutil.parser
 import lxml.html
 import babel.localedata
 
-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.exceptions import SearxEngineCaptchaException
 from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
     Additionally the arguments form Startpage's search form needs to be set in
     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
     """
-    if startpage_categ == 'web':
-        return _request_cat_web(query, params)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return params
-
-
-def _request_cat_web(query, params):
-
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')
 
     # build arguments
     args = {
         'query': query,
-        'cat': 'web',
+        'cat': startpage_categ,
         't': 'device',
         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
         'with_date': time_range_dict.get(params['time_range'], ''),
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
     return params
 
 
-# get response from search-request
+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
+    published_date = None
+
+    # check if search result starts with something like: "2 Sep 2014 ... "
+    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]
+        # fix content string
+        content = content[date_pos:]
+        try:
+            published_date = dateutil.parser.parse(date_string, dayfirst=True)
+        except ValueError:
+            pass
+
+    # check if search result starts with something like: "5 days ago ... "
+    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]
+        # calculate datetime
+        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
+        # fix content string
+        content = content[date_pos:]
+
+    return content, published_date
+
+
+def _get_web_result(result):
+    content = html_to_text(result.get('description'))
+    content, publishedDate = _parse_published_date(content)
+
+    return {
+        'url': result['clickUrl'],
+        'title': html_to_text(result['title']),
+        'content': content,
+        'publishedDate': publishedDate,
+    }
+
+
+def _get_news_result(result):
+    title = remove_pua_from_str(html_to_text(result['title']))
+    content = remove_pua_from_str(html_to_text(result.get('description')))
+
+    publishedDate = None
+    if result.get('date'):
+        publishedDate = datetime.fromtimestamp(result['date'] / 1000)
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    return {
+        'url': result['clickUrl'],
+        'title': title,
+        'content': content,
+        'publishedDate': publishedDate,
+        'thumbnail': thumbnailUrl,
+    }
+
+
+def _get_image_result(result) -> dict[str, Any] | None:
+    url = result.get('altClickUrl')
+    if not url:
+        return None
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    resolution = None
+    if result.get('width') and result.get('height'):
+        resolution = f"{result['width']}x{result['height']}"
+
+    filesize = None
+    if result.get('filesize'):
+        size_str = ''.join(filter(str.isdigit, result['filesize']))
+        filesize = humanize_bytes(int(size_str))
+
+    return {
+        'template': 'images.html',
+        'url': url,
+        'title': html_to_text(result['title']),
+        'content': '',
+        'img_src': result.get('rawImageUrl'),
+        'thumbnail_src': thumbnailUrl,
+        'resolution': resolution,
+        'img_format': result.get('format'),
+        'filesize': filesize,
+    }
+
+
 def response(resp):
-    dom = lxml.html.fromstring(resp.text)
-
-    if startpage_categ == 'web':
-        return _response_cat_web(dom)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return []
-
-
-def _response_cat_web(dom):
+    categ = startpage_categ.capitalize()
+    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
+    results_json = loads(results_raw)
+    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
+
     results = []
-
-    # parse results
-    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
-        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
-        if not links:
-            continue
-        link = links[0]
-        url = link.attrib.get('href')
-
-        # block google-ad url's
-        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
-            continue
-
-        # block startpage search url's
-        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
-            continue
-
-        title = extract_text(eval_xpath(link, 'h2'))
-        content = eval_xpath(result, './/p[contains(@class, "description")]')
-        content = extract_text(content, allow_none=True) or ''
-
-        published_date = None
-
-        # check if search result starts with something like: "2 Sep 2014 ... "
-        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
-            # fix content string
-            content = content[date_pos:]
-            try:
-                published_date = dateutil.parser.parse(date_string, dayfirst=True)
-            except ValueError:
-                pass
-
-        # check if search result starts with something like: "5 days ago ... "
-        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
-            # calculate datetime
-            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
-            # fix content string
-            content = content[date_pos:]
-
-        if published_date:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
-        else:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content})
-
-    # return results
+    for results_categ in results_obj.get('mainline', []):
+        for item in results_categ.get('results', []):
+            if results_categ['display_type'] == 'web-google':
+                results.append(_get_web_result(item))
+            elif results_categ['display_type'] == 'news-bing':
+                results.append(_get_news_result(item))
+            elif 'images' in results_categ['display_type']:
+                item = _get_image_result(item)
+                if item:
+                    results.append(item)
+
     return results
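
Note: the rewritten `response()` no longer walks the HTML tree; it slices the serialized React payload out of the page and parses it as JSON. The sketch below illustrates that slicing step with an invented `PAGE` string and a simplified `_extr` helper standing in for `searx.utils.extr`:

    import json

    def _extr(text: str, begin: str, end: str) -> str:
        # substring between the first `begin` and the next `end`
        start = text.index(begin) + len(begin)
        return text[start:text.index(end, start)]

    PAGE = 'React.createElement(UIStartpage.AppSerpWeb, {"render": {"presenter": {"regions": {"mainline": []}}}})'

    raw = '{' + _extr(PAGE, 'React.createElement(UIStartpage.AppSerpWeb, {', '}})') + '}}'
    data = json.loads(raw)
    print(data['render']['presenter']['regions'])  # {'mainline': []}

The opening `{` and closing `}}` are re-attached by hand because the begin and end markers swallow them during extraction.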

searx/settings.yml

@@ -1451,9 +1451,12 @@ engines:
     frontend_url: https://srv.piped.video
     # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/
     backend_url:
-      - https://pipedapi.kavin.rocks
-      - https://pipedapi-libre.kavin.rocks
       - https://pipedapi.adminforge.de
+      - https://pipedapi.nosebs.ru
+      - https://pipedapi.ducks.party
+      - https://pipedapi.reallyaweso.me
+      - https://api.piped.private.coffee
+      - https://pipedapi.darkness.services
 
   - name: piped.music
     engine: piped
@@ -1787,11 +1790,23 @@ engines:
   - name: startpage
     engine: startpage
     shortcut: sp
     timeout: 6.0
     disabled: true
     startpage_categ: web
     categories: [general, web]
     additional_tests:
       rosebud: *test_rosebud
 
+  - name: startpage news
+    engine: startpage
+    startpage_categ: news
+    categories: [news, web]
+    shortcut: spn
+
+  - name: startpage images
+    engine: startpage
+    startpage_categ: images
+    categories: [images, web]
+    shortcut: spi
+
   - name: tokyotoshokan
     engine: tokyotoshokan
     shortcut: tt
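
Note: keys of an engine entry such as `startpage_categ: news` are applied to the engine's Python module as attributes, which is how one `startpage.py` can back three engines. A simplified sketch of that mechanism (SearXNG's real loader adds validation and reserved keys on top; `types.SimpleNamespace` stands in for the imported module):

    import types

    def apply_engine_settings(engine, setting: dict):
        # copy settings.yml keys onto the engine module as attributes
        for key, value in setting.items():
            if key != 'engine':            # 'engine' only names the module
                setattr(engine, key, value)

    startpage = types.SimpleNamespace()    # stand-in for searx.engines.startpage
    apply_engine_settings(startpage, {
        'name': 'startpage news',
        'startpage_categ': 'news',
        'categories': ['news', 'web'],
        'shortcut': 'spn',
    })
    print(startpage.startpage_categ)       # news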

searx/utils.py

@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
     return string
 
 
+def remove_pua_from_str(string):
+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
+
+    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
+    """
+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
+    s = []
+    for c in string:
+        i = ord(c)
+        if any(a <= i <= b for (a, b) in pua_ranges):
+            continue
+        s.append(c)
+    return "".join(s)
+
+
 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
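
Note: a quick usage check of the new helper; the sample string is an invented example. U+E000-U+F8FF and the two supplementary ranges matched above are Unicode Private Use Areas, so such code points carry no standardized meaning and usually render as tofu:

    from searx.utils import remove_pua_from_str

    sample = "Weather\ue000 report\uf8ff"
    print(remove_pua_from_str(sample))  # Weather report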

searx/webapp.py

@@ -594,7 +594,7 @@ def health():
 @app.route('/client<token>.css', methods=['GET', 'POST'])
 def client_token(token=None):
     link_token.ping(sxng_request, token)
-    return Response('', mimetype='text/css')
+    return Response('', mimetype='text/css', headers={"Cache-Control": "no-store, max-age=0"})
 
 
 @app.route('/rss.xsl', methods=['GET', 'POST'])
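
Note: like the limiter change above, `no-store` keeps the link-token stylesheet out of browser and proxy caches, so every page load re-requests it and actually reaches the ping endpoint. A minimal standalone sketch of the pattern (plain Flask, not SearXNG's app):

    from flask import Flask, Response

    app = Flask(__name__)

    @app.route('/client.css')
    def client_css():
        # force revalidation: cached copies would silently skip the ping
        return Response('', mimetype='text/css',
                        headers={"Cache-Control": "no-store, max-age=0"})

    with app.test_client() as client:
        print(client.get('/client.css').headers["Cache-Control"])
        # no-store, max-age=0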