Merge remote-tracking branch 'upstream/master'

Arcane Spark 2025-02-20 14:50:32 +01:00
commit 5d7b7ec199
8 changed files with 181 additions and 96 deletions

AUTHORS.rst

@@ -175,3 +175,4 @@ features or generally made searx better:
 - Daniel Kukula `<https://github.com/dkuku>`
 - Patrick Evans `https://github.com/holysoles`
 - Daniel Mowitz `<https://daniel.mowitz.rocks>`
+- `Bearz314 <https://github.com/bearz314>`_

searx/botdetection/ip_limit.py

@@ -123,7 +123,9 @@ def filter_request(
         )
         if c > SUSPICIOUS_IP_MAX:
             logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
-            return flask.redirect(flask.url_for('index'), code=302)
+            response = flask.redirect(flask.url_for('index'), code=302)
+            response.headers["Cache-Control"] = "no-store, max-age=0"
+            return response
 
     c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
     if c > BURST_MAX_SUSPICIOUS:
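
Note: `incr_sliding_window` counts requests per client network in Redis; once the count passes SUSPICIOUS_IP_MAX the client is redirected to the index page, and the new `Cache-Control: no-store` header keeps that 302 from being cached and replayed after the window has expired. The sketch below shows the counting idea with a fixed-window counter; it is an illustration only, not SearXNG's actual implementation, and it assumes redis-py >= 4.2 / Redis >= 7.0 for `EXPIRE ... NX`.

    import redis

    def incr_window(client: redis.Redis, key: str, window_sec: int) -> int:
        # Count one hit against `key` and return the running total.  The key
        # expires `window_sec` seconds after its first hit, so the counter
        # resets once the window has passed (fixed-window approximation).
        pipe = client.pipeline()
        pipe.incr(key)
        pipe.expire(key, window_sec, nx=True)  # set a TTL only if none exists yet
        count, _ = pipe.execute()
        return count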

searx/engines/brave.py

@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
     if brave_category in ('search', 'goggles'):
         return _parse_search(resp)
 
+    if brave_category == 'news':
+        return _parse_news(resp)
+
     datastr = extr(resp.text, "const data = ", ";\n").strip()
     json_data = js_variable_to_python(datastr)
     json_resp = json_data[1]['data']['body']['response']
 
-    if brave_category == 'news':
-        return _parse_news(json_resp['news'])
-
     if brave_category == 'images':
         return _parse_images(json_resp)
     if brave_category == 'videos':
@@ -339,18 +339,31 @@ def _parse_search(resp) -> EngineResults:
     return result_list
 
 
-def _parse_news(json_resp) -> EngineResults:
-    result_list = EngineResults()
+def _parse_news(resp) -> EngineResults:
+    result_list = EngineResults()
+    dom = html.fromstring(resp.text)
+
+    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):
+        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
+        if url is None:
+            continue
+
+        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
+        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
+        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')
 
-    for result in json_resp["results"]:
-        item = {
-            'url': result['url'],
-            'title': result['title'],
-            'content': result['description'],
-            'publishedDate': _extract_published_date(result['age']),
+        item = {
+            "url": url,
+            "title": title,
+            "content": content,
+            "thumbnail": thumbnail,
         }
-        if result['thumbnail'] is not None:
-            item['thumbnail'] = result['thumbnail']['src']
         result_list.append(item)
 
     return result_list
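
Note: `eval_xpath_list`, `eval_xpath_getindex` and `extract_text` are SearXNG helpers from `searx.utils`. The standalone sketch below mirrors the new news loop with plain lxml calls; the sample markup is an invented stand-in for Brave's real result page.

    from lxml import html

    SAMPLE = '''
    <div class="results">
      <div data-type="news">
        <a class="result-header" href="https://example.org/story">story</a>
        <span class="snippet-title">Example story</span>
        <p class="desc">Short description.</p>
        <div class="image-wrapper"><img src="https://example.org/t.jpg"></div>
      </div>
    </div>
    '''

    dom = html.fromstring(SAMPLE)
    for result in dom.xpath('//div[contains(@class, "results")]//div[@data-type="news"]'):
        hrefs = result.xpath('.//a[contains(@class, "result-header")]/@href')
        if not hrefs:  # skip results without a link, like the `continue` above
            continue
        print({
            "url": hrefs[0],
            "title": " ".join(result.xpath('.//span[contains(@class, "snippet-title")]/text()')),
            "content": " ".join(result.xpath('.//p[contains(@class, "desc")]/text()')),
            "thumbnail": next(iter(result.xpath('.//div[contains(@class, "image-wrapper")]//img/@src')), ""),
        })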

searx/engines/mojeek.py

@@ -67,11 +67,13 @@ def request(query, params):
     args = {
         'q': query,
         'safe': min(params['safesearch'], 1),
-        'fmt': search_type,
         language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
         region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
     }
 
+    if search_type:
+        args['fmt'] = search_type
+
     if search_type == '':
         args['s'] = 10 * (params['pageno'] - 1)
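
Note: dropping the empty `fmt` value matters because every key in `args` ends up in the query string. A minimal illustration (the endpoint URL and parameter values are made up for the example):

    from urllib.parse import urlencode

    args = {'q': 'test', 'safe': 1}            # web search: no 'fmt' at all
    print('https://www.mojeek.com/search?' + urlencode(args))
    # https://www.mojeek.com/search?q=test&safe=1

    args['fmt'] = 'news'                       # news search: 'fmt' is set
    print('https://www.mojeek.com/search?' + urlencode(args))
    # https://www.mojeek.com/search?q=test&safe=1&fmt=news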

searx/engines/startpage.py

@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by
 .. hint::
 
-   The default category is ``web`` .. and other categories than ``web`` are not
-   yet implemented.
+   Supported categories are ``web``, ``news`` and ``images``.
 
 """
 # pylint: disable=too-many-statements
 
 from __future__ import annotations
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from collections import OrderedDict
 import re
 from unicodedata import normalize, combining
 from time import time
 from datetime import datetime, timedelta
+from json import loads
 
 import dateutil.parser
 import lxml.html
 import babel.localedata
 
-from searx.utils import extract_text, eval_xpath, gen_useragent
+from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
 from searx.exceptions import SearxEngineCaptchaException
 from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
     Additionally the arguments form Startpage's search form needs to be set in
     HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
     """
-    if startpage_categ == 'web':
-        return _request_cat_web(query, params)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return params
-
-
-def _request_cat_web(query, params):
-
     engine_region = traits.get_region(params['searxng_locale'], 'en-US')
     engine_language = traits.get_language(params['searxng_locale'], 'en')
 
     # build arguments
     args = {
         'query': query,
-        'cat': 'web',
+        'cat': startpage_categ,
         't': 'device',
         'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
         'with_date': time_range_dict.get(params['time_range'], ''),
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
     return params
 
 
-# get response from search-request
+def _parse_published_date(content: str) -> tuple[str, datetime | None]:
+    published_date = None
+
+    # check if search result starts with something like: "2 Sep 2014 ... "
+    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]
+        # fix content string
+        content = content[date_pos:]
+        try:
+            published_date = dateutil.parser.parse(date_string, dayfirst=True)
+        except ValueError:
+            pass
+
+    # check if search result starts with something like: "5 days ago ... "
+    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
+        date_pos = content.find('...') + 4
+        date_string = content[0 : date_pos - 5]
+        # calculate datetime
+        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
+        # fix content string
+        content = content[date_pos:]
+
+    return content, published_date
+
+
+def _get_web_result(result):
+    content = html_to_text(result.get('description'))
+    content, publishedDate = _parse_published_date(content)
+
+    return {
+        'url': result['clickUrl'],
+        'title': html_to_text(result['title']),
+        'content': content,
+        'publishedDate': publishedDate,
+    }
+
+
+def _get_news_result(result):
+    title = remove_pua_from_str(html_to_text(result['title']))
+    content = remove_pua_from_str(html_to_text(result.get('description')))
+
+    publishedDate = None
+    if result.get('date'):
+        publishedDate = datetime.fromtimestamp(result['date'] / 1000)
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    return {
+        'url': result['clickUrl'],
+        'title': title,
+        'content': content,
+        'publishedDate': publishedDate,
+        'thumbnail': thumbnailUrl,
+    }
+
+
+def _get_image_result(result) -> dict[str, Any] | None:
+    url = result.get('altClickUrl')
+    if not url:
+        return None
+
+    thumbnailUrl = None
+    if result.get('thumbnailUrl'):
+        thumbnailUrl = base_url + result['thumbnailUrl']
+
+    resolution = None
+    if result.get('width') and result.get('height'):
+        resolution = f"{result['width']}x{result['height']}"
+
+    filesize = None
+    if result.get('filesize'):
+        size_str = ''.join(filter(str.isdigit, result['filesize']))
+        filesize = humanize_bytes(int(size_str))
+
+    return {
+        'template': 'images.html',
+        'url': url,
+        'title': html_to_text(result['title']),
+        'content': '',
+        'img_src': result.get('rawImageUrl'),
+        'thumbnail_src': thumbnailUrl,
+        'resolution': resolution,
+        'img_format': result.get('format'),
+        'filesize': filesize,
+    }
+
+
 def response(resp):
-    dom = lxml.html.fromstring(resp.text)
-
-    if startpage_categ == 'web':
-        return _response_cat_web(dom)
-
-    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
-    return []
-
-
-def _response_cat_web(dom):
+    categ = startpage_categ.capitalize()
+    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
+    results_json = loads(results_raw)
+    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})
+
     results = []
-
-    # parse results
-    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
-        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
-        if not links:
-            continue
-        link = links[0]
-        url = link.attrib.get('href')
-
-        # block google-ad url's
-        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
-            continue
-
-        # block startpage search url's
-        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
-            continue
-
-        title = extract_text(eval_xpath(link, 'h2'))
-        content = eval_xpath(result, './/p[contains(@class, "description")]')
-        content = extract_text(content, allow_none=True) or ''
-
-        published_date = None
-
-        # check if search result starts with something like: "2 Sep 2014 ... "
-        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
-            # fix content string
-            content = content[date_pos:]
-            try:
-                published_date = dateutil.parser.parse(date_string, dayfirst=True)
-            except ValueError:
-                pass
-
-        # check if search result starts with something like: "5 days ago ... "
-        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
-            date_pos = content.find('...') + 4
-            date_string = content[0 : date_pos - 5]
-            # calculate datetime
-            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore
-            # fix content string
-            content = content[date_pos:]
-
-        if published_date:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
-        else:
-            # append result
-            results.append({'url': url, 'title': title, 'content': content})
-
-    # return results
+    for results_categ in results_obj.get('mainline', []):
+        for item in results_categ.get('results', []):
+            if results_categ['display_type'] == 'web-google':
+                results.append(_get_web_result(item))
+            elif results_categ['display_type'] == 'news-bing':
+                results.append(_get_news_result(item))
+            elif 'images' in results_categ['display_type']:
+                item = _get_image_result(item)
+                if item:
+                    results.append(item)
+
     return results
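
Note: the rewritten `response()` no longer walks the HTML tree; it slices the serialized React payload out of the page and parses it as JSON. The sketch below illustrates that slicing step with an invented `PAGE` string and a simplified `_extr` helper standing in for `searx.utils.extr`:

    import json

    def _extr(text: str, begin: str, end: str) -> str:
        # substring between the first `begin` and the next `end`
        start = text.index(begin) + len(begin)
        return text[start:text.index(end, start)]

    PAGE = 'React.createElement(UIStartpage.AppSerpWeb, {"render": {"presenter": {"regions": {"mainline": []}}}})'

    raw = '{' + _extr(PAGE, 'React.createElement(UIStartpage.AppSerpWeb, {', '}})') + '}}'
    data = json.loads(raw)
    print(data['render']['presenter']['regions'])  # {'mainline': []}

The opening `{` and closing `}}` are re-attached by hand because the begin and end markers swallow them during extraction.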

searx/settings.yml

@@ -1451,9 +1451,12 @@ engines:
     frontend_url: https://srv.piped.video
     # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/
     backend_url:
-      - https://pipedapi.kavin.rocks
-      - https://pipedapi-libre.kavin.rocks
       - https://pipedapi.adminforge.de
+      - https://pipedapi.nosebs.ru
+      - https://pipedapi.ducks.party
+      - https://pipedapi.reallyaweso.me
+      - https://api.piped.private.coffee
+      - https://pipedapi.darkness.services
 
   - name: piped.music
     engine: piped
@@ -1787,11 +1790,23 @@ engines:
   - name: startpage
     engine: startpage
     shortcut: sp
     timeout: 6.0
     disabled: true
     startpage_categ: web
     categories: [general, web]
     additional_tests:
       rosebud: *test_rosebud
 
+  - name: startpage news
+    engine: startpage
+    startpage_categ: news
+    categories: [news, web]
+    shortcut: spn
+
+  - name: startpage images
+    engine: startpage
+    startpage_categ: images
+    categories: [images, web]
+    shortcut: spi
+
   - name: tokyotoshokan
     engine: tokyotoshokan
     shortcut: tt
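
Note: keys of an engine entry such as `startpage_categ: news` are applied to the engine's Python module as attributes, which is how one `startpage.py` can back three engines. A simplified sketch of that mechanism (SearXNG's real loader adds validation and reserved keys on top; `types.SimpleNamespace` stands in for the imported module):

    import types

    def apply_engine_settings(engine, setting: dict):
        # copy settings.yml keys onto the engine module as attributes
        for key, value in setting.items():
            if key != 'engine':            # 'engine' only names the module
                setattr(engine, key, value)

    startpage = types.SimpleNamespace()    # stand-in for searx.engines.startpage
    apply_engine_settings(startpage, {
        'name': 'startpage news',
        'startpage_categ': 'news',
        'categories': ['news', 'web'],
        'shortcut': 'spn',
    })
    print(startpage.startpage_categ)       # news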

searx/utils.py

@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
     return string
 
 
+def remove_pua_from_str(string):
+    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
+
+    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
+    """
+    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
+    s = []
+    for c in string:
+        i = ord(c)
+        if any(a <= i <= b for (a, b) in pua_ranges):
+            continue
+        s.append(c)
+    return "".join(s)
+
+
 def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
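
Note: a quick usage check of the new helper; the sample string is an invented example. U+E000-U+F8FF and the two supplementary ranges matched above are Unicode Private Use Areas, so such code points carry no standardized meaning and usually render as tofu:

    from searx.utils import remove_pua_from_str

    sample = "Weather\ue000 report\uf8ff"
    print(remove_pua_from_str(sample))  # Weather report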

searx/webapp.py

@@ -594,7 +594,7 @@ def health():
 @app.route('/client<token>.css', methods=['GET', 'POST'])
 def client_token(token=None):
     link_token.ping(sxng_request, token)
-    return Response('', mimetype='text/css')
+    return Response('', mimetype='text/css', headers={"Cache-Control": "no-store, max-age=0"})
 
 
 @app.route('/rss.xsl', methods=['GET', 'POST'])
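
Note: like the limiter change above, `no-store` keeps the link-token stylesheet out of browser and proxy caches, so every page load re-requests it and actually reaches the ping endpoint. A minimal standalone sketch of the pattern (plain Flask, not SearXNG's app):

    from flask import Flask, Response

    app = Flask(__name__)

    @app.route('/client.css')
    def client_css():
        # force revalidation: cached copies would silently skip the ping
        return Response('', mimetype='text/css',
                        headers={"Cache-Control": "no-store, max-age=0"})

    with app.test_client() as client:
        print(client.get('/client.css').headers["Cache-Control"])
        # no-store, max-age=0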