diff --git a/AUTHORS.rst b/AUTHORS.rst
index 95d154b12..adf4eb7d9 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -175,3 +175,4 @@ features or generally made searx better:
 - Daniel Kukula `<https://github.com/dkuku>`
 - Patrick Evans `https://github.com/holysoles`
 - Daniel Mowitz `<https://daniel.mowitz.rocks>`
+- `Bearz314 <https://github.com/bearz314>`_
diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py
index b4c6825b3..161a9826e 100644
--- a/searx/botdetection/ip_limit.py
+++ b/searx/botdetection/ip_limit.py
@@ -123,7 +123,9 @@ def filter_request(
         )
         if c > SUSPICIOUS_IP_MAX:
             logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
-            return flask.redirect(flask.url_for('index'), code=302)
+            response = flask.redirect(flask.url_for('index'), code=302)
+            response.headers["Cache-Control"] = "no-store, max-age=0"
+            return response
 
         c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
         if c > BURST_MAX_SUSPICIOUS:
diff --git a/searx/engines/brave.py b/searx/engines/brave.py
index 828f6154e..90cce4045 100644
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
 
     if brave_category in ('search', 'goggles'):
         return _parse_search(resp)
 
+    if brave_category == 'news':
+        return _parse_news(resp)
+
     datastr = extr(resp.text, "const data = ", ";\n").strip()
     json_data = js_variable_to_python(datastr)
     json_resp = json_data[1]['data']['body']['response']
 
-    if brave_category == 'news':
-        return _parse_news(json_resp['news'])
-
     if brave_category == 'images':
         return _parse_images(json_resp)
     if brave_category == 'videos':
@@ -339,18 +339,26 @@ def _parse_search(resp) -> EngineResults:
     return result_list
 
 
-def _parse_news(json_resp) -> EngineResults:
-    result_list = EngineResults()
+def _parse_news(resp) -> EngineResults:
+    result_list = EngineResults()
+    dom = html.fromstring(resp.text)
+
+    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
+        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+        if url is None:
+            continue
+
+        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
+        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
+        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
 
-    for result in json_resp["results"]:
         item = {
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['description'],
-            'publishedDate': _extract_published_date(result['age']),
+            "url": url,
+            "title": title,
+            "content": content,
+            "thumbnail": thumbnail,
         }
-        if result['thumbnail'] is not None:
-            item['thumbnail'] = result['thumbnail']['src']
+
         result_list.append(item)
 
     return result_list
diff --git a/searx/engines/mojeek.py b/searx/engines/mojeek.py
index df2302e8b..035279b06 100644
--- a/searx/engines/mojeek.py
+++ b/searx/engines/mojeek.py
@@ -67,11 +67,13 @@ def request(query, params):
     args = {
         'q': query,
         'safe': min(params['safesearch'], 1),
-        'fmt': search_type,
        language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
        region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
     }
 
+    if search_type:
+        args['fmt'] = search_type
+
     if search_type == '':
         args['s'] = 10 * (params['pageno'] - 1)
 
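The two `Cache-Control` changes in this patch (in `ip_limit.py` above, and in `webapp.py` further below) follow the same pattern: responses that must never be served from a cache get an explicit `no-store, max-age=0` header, so browsers and intermediaries re-evaluate the bot-detection decision on every request. A minimal, self-contained Flask sketch of that pattern; the app and route names here are illustrative only, not part of the patch:

```python
import flask

app = flask.Flask(__name__)

@app.route('/')
def index():
    return "ok"

@app.route('/blocked')
def blocked():
    # Redirect suspicious clients to the index page, but forbid caching so
    # the 302 is not pinned by the browser or a proxy.
    response = flask.redirect(flask.url_for('index'), code=302)
    response.headers["Cache-Control"] = "no-store, max-age=0"
    return response
```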
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index f90cf2ac9..54e05604b 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
 
 .. hint::
 
-   The default category is ``web`` .. and other categories than ``web`` are not
-   yet implemented.
+   Supported categories are ``web``, ``news`` and ``images``.
 
 """
 # pylint: disable=too-many-statements
 
-from typing import TYPE_CHECKING
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
 from collections import OrderedDict
 import re
 from unicodedata import normalize, combining
 from time import time
 from datetime import datetime, timedelta
+from json import loads
 
 import dateutil.parser
 import lxml.html
 import babel.localedata
 
-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.exceptions import SearxEngineCaptchaException
 from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
     Additionally the arguments form Startpage's search form needs to be set in
     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
     """
-    if startpage_categ == 'web':
-        return _request_cat_web(query, params)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return params
-
-
-def _request_cat_web(query, params):
-
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')
 
     # build arguments
     args = {
         'query': query,
-        'cat': 'web',
+        'cat': startpage_categ,
         't': 'device',
         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
         'with_date': time_range_dict.get(params['time_range'], ''),
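The `_parse_published_date()` helper added in the next hunk splits a date prefix such as `"5 days ago ... "` off the snippet text before the remainder is used as result content. A standalone sketch of the "days ago" branch, using a made-up sample string rather than real Startpage output:

```python
import re
from datetime import datetime, timedelta

content = "5 days ago ... a snippet about searxng"
if re.match(r"^[0-9]+ days? ago \.\.\. ", content):
    date_pos = content.find('...') + 4
    # the leading digits encode the age of the result in days
    days = int(re.match(r'\d+', content[:date_pos - 5]).group())
    published_date = datetime.now() - timedelta(days=days)
    content = content[date_pos:]  # strip the date prefix from the snippet
    print(published_date.date(), '|', content)
```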
", content): + date_pos = content.find('...') + 4 + date_string = content[0 : date_pos - 5] + + # calculate datetime + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore + + # fix content string + content = content[date_pos:] + + return content, published_date + + +def _get_web_result(result): + content = html_to_text(result.get('description')) + content, publishedDate = _parse_published_date(content) + + return { + 'url': result['clickUrl'], + 'title': html_to_text(result['title']), + 'content': content, + 'publishedDate': publishedDate, + } + + +def _get_news_result(result): + + title = remove_pua_from_str(html_to_text(result['title'])) + content = remove_pua_from_str(html_to_text(result.get('description'))) + + publishedDate = None + if result.get('date'): + publishedDate = datetime.fromtimestamp(result['date'] / 1000) + + thumbnailUrl = None + if result.get('thumbnailUrl'): + thumbnailUrl = base_url + result['thumbnailUrl'] + + return { + 'url': result['clickUrl'], + 'title': title, + 'content': content, + 'publishedDate': publishedDate, + 'thumbnail': thumbnailUrl, + } + + +def _get_image_result(result) -> dict[str, Any] | None: + url = result.get('altClickUrl') + if not url: + return None + + thumbnailUrl = None + if result.get('thumbnailUrl'): + thumbnailUrl = base_url + result['thumbnailUrl'] + + resolution = None + if result.get('width') and result.get('height'): + resolution = f"{result['width']}x{result['height']}" + + filesize = None + if result.get('filesize'): + size_str = ''.join(filter(str.isdigit, result['filesize'])) + filesize = humanize_bytes(int(size_str)) + + return { + 'template': 'images.html', + 'url': url, + 'title': html_to_text(result['title']), + 'content': '', + 'img_src': result.get('rawImageUrl'), + 'thumbnail_src': thumbnailUrl, + 'resolution': resolution, + 'img_format': result.get('format'), + 'filesize': filesize, + } + + def response(resp): - dom = lxml.html.fromstring(resp.text) + categ = startpage_categ.capitalize() + results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}' + results_json = loads(results_raw) + results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {}) - if startpage_categ == 'web': - return _response_cat_web(dom) - - logger.error("Startpages's category '%' is not yet implemented.", startpage_categ) - return [] - - -def _response_cat_web(dom): results = [] + for results_categ in results_obj.get('mainline', []): + for item in results_categ.get('results', []): + if results_categ['display_type'] == 'web-google': + results.append(_get_web_result(item)) + elif results_categ['display_type'] == 'news-bing': + results.append(_get_news_result(item)) + elif 'images' in results_categ['display_type']: + item = _get_image_result(item) + if item: + results.append(item) - # parse results - for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'): - links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]') - if not links: - continue - link = links[0] - url = link.attrib.get('href') - - # block google-ad url's - if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): - continue - - # block startpage search url's - if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): - continue - - title = extract_text(eval_xpath(link, 'h2')) - content = eval_xpath(result, './/p[contains(@class, "description")]') - content = extract_text(content, 
diff --git a/searx/settings.yml b/searx/settings.yml
index 105f88bc3..ed838358a 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1451,9 +1451,12 @@ engines:
     frontend_url: https://srv.piped.video
     # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/
     backend_url:
-      - https://pipedapi.kavin.rocks
-      - https://pipedapi-libre.kavin.rocks
       - https://pipedapi.adminforge.de
+      - https://pipedapi.nosebs.ru
+      - https://pipedapi.ducks.party
+      - https://pipedapi.reallyaweso.me
+      - https://api.piped.private.coffee
+      - https://pipedapi.darkness.services
 
   - name: piped.music
     engine: piped
@@ -1787,11 +1790,23 @@ engines:
   - name: startpage
     engine: startpage
     shortcut: sp
-    timeout: 6.0
-    disabled: true
+    startpage_categ: web
+    categories: [general, web]
     additional_tests:
       rosebud: *test_rosebud
 
+  - name: startpage news
+    engine: startpage
+    startpage_categ: news
+    categories: [news, web]
+    shortcut: spn
+
+  - name: startpage images
+    engine: startpage
+    startpage_categ: images
+    categories: [images, web]
+    shortcut: spi
+
   - name: tokyotoshokan
     engine: tokyotoshokan
     shortcut: tt
diff --git a/searx/utils.py b/searx/utils.py
index 4fcbd9e3c..c7a579451 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
     return string
 
 
+def remove_pua_from_str(string):
+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
+
+    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
+    """
+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
+    s = []
+    for c in string:
+        i = ord(c)
+        if any(a <= i <= b for (a, b) in pua_ranges):
+            continue
+        s.append(c)
+    return "".join(s)
+
+
 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
diff --git a/searx/webapp.py b/searx/webapp.py
index 9d51b5e8c..7104853e8 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -594,7 +594,7 @@ def health():
 @app.route('/client<token>.css', methods=['GET', 'POST'])
 def client_token(token=None):
     link_token.ping(sxng_request, token)
-    return Response('', mimetype='text/css')
+    return Response('', mimetype='text/css', headers={"Cache-Control": "no-store, max-age=0"})
 
 
 @app.route('/rss.xsl', methods=['GET', 'POST'])
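The `remove_pua_from_str()` helper added to `searx/utils.py` strips Private Use Area code points, which the new Startpage news parser removes from titles and snippets. A standalone sanity check of the same logic, re-implemented here so the snippet runs without a searx checkout:

```python
# same ranges as the pua_ranges tuple added to searx/utils.py
PUA_RANGES = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))

def strip_pua(text: str) -> str:
    # drop any character whose code point falls in a Private Use Area
    return "".join(c for c in text if not any(a <= ord(c) <= b for a, b in PUA_RANGES))

assert strip_pua("news \ue000title\uf8ff") == "news title"
assert strip_pua("plain text") == "plain text"
```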