Merge remote-tracking branch 'upstream/master'
commit 5d7b7ec199
8 changed files with 181 additions and 96 deletions
AUTHORS.rst
searx
@@ -175,3 +175,4 @@ features or generally made searx better:
- Daniel Kukula `<https://github.com/dkuku>`
- Patrick Evans `https://github.com/holysoles`
- Daniel Mowitz `<https://daniel.mowitz.rocks>`
- `Bearz314 <https://github.com/bearz314>`_
@@ -123,7 +123,9 @@ def filter_request(
        )
        if c > SUSPICIOUS_IP_MAX:
            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
            return flask.redirect(flask.url_for('index'), code=302)
            response = flask.redirect(flask.url_for('index'), code=302)
            response.headers["Cache-Control"] = "no-store, max-age=0"
            return response

        c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
        if c > BURST_MAX_SUSPICIOUS:
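Context for the hunk above: the 302 block-redirect now carries Cache-Control: no-store, max-age=0, so browsers and intermediate caches do not replay the redirect after the suspicious-IP window has expired. A minimal, self-contained sketch of the same pattern (plain Flask, not searx code; the route names are illustrative):

import flask

app = flask.Flask(__name__)

@app.route('/')
def index():
    return 'index'

@app.route('/blocked-example')
def blocked_example():
    # redirect to the index page, but mark the redirect response itself as uncacheable
    response = flask.redirect(flask.url_for('index'), code=302)
    response.headers["Cache-Control"] = "no-store, max-age=0"
    return response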
@@ -254,14 +254,14 @@ def response(resp) -> EngineResults:
    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    if brave_category in ('news'):
        return _parse_news(resp)

    datastr = extr(resp.text, "const data = ", ";\n").strip()

    json_data = js_variable_to_python(datastr)
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'news':
        return _parse_news(json_resp['news'])

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
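One note on the dispatch above: ('news') is a plain string, not a one-element tuple, so "brave_category in ('news')" is a substring test. It holds for the values brave_category can take, but ('news',) or == 'news' would state the intent; a quick illustration:

print('news' in ('news'))    # True -- ('news') is just the string 'news'
print('new' in ('news'))     # True as well, via substring matching
print('news' in ('news',))   # True -- actual tuple membership
print('new' in ('news',))    # False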
@@ -339,18 +339,31 @@ def _parse_search(resp) -> EngineResults:
    return result_list


def _parse_news(json_resp) -> EngineResults:
    result_list = EngineResults()
def _parse_news(resp) -> EngineResults:

    result_list = EngineResults()
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class, "results")]//div[@data-type="news"]'):

        # import pdb
        # pdb.set_trace()

        url = eval_xpath_getindex(result, './/a[contains(@class, "result-header")]/@href', 0, default=None)
        if url is None:
            continue

        title = extract_text(eval_xpath_list(result, './/span[contains(@class, "snippet-title")]'))
        content = extract_text(eval_xpath_list(result, './/p[contains(@class, "desc")]'))
        thumbnail = eval_xpath_getindex(result, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
            'publishedDate': _extract_published_date(result['age']),
            "url": url,
            "title": title,
            "content": content,
            "thumbnail": thumbnail,
        }
        if result['thumbnail'] is not None:
            item['thumbnail'] = result['thumbnail']['src']

        result_list.append(item)

    return result_list
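The rewritten _parse_news() above scrapes Brave's HTML result list instead of the embedded JSON blob. A standalone sketch of the same XPath pattern with plain lxml; the HTML snippet is made up to mirror the selectors, not Brave's real markup:

from lxml import html

doc = html.fromstring('''
<div class="results">
  <div data-type="news">
    <a class="result-header" href="https://example.org/story">
      <span class="snippet-title">Example story</span>
    </a>
    <p class="desc">Short description.</p>
  </div>
</div>''')

for result in doc.xpath('//div[contains(@class, "results")]//div[@data-type="news"]'):
    url = result.xpath('.//a[contains(@class, "result-header")]/@href')
    title = result.xpath('.//span[contains(@class, "snippet-title")]/text()')
    desc = result.xpath('.//p[contains(@class, "desc")]/text()')
    # each field falls back to an empty value when the node is missing
    print(url[0] if url else None, title[0].strip() if title else '', desc[0].strip() if desc else '')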
@@ -67,11 +67,13 @@ def request(query, params):
    args = {
        'q': query,
        'safe': min(params['safesearch'], 1),
        'fmt': search_type,
        language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
        region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
    }

    if search_type:
        args['fmt'] = search_type

    if search_type == '':
        args['s'] = 10 * (params['pageno'] - 1)
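The mojeek change above only sends fmt when a non-empty search_type is configured, instead of always putting it in the argument dict. A sketch of the effect on the query string, assuming the usual urlencode-based URL building:

from urllib.parse import urlencode

search_type = ''            # default web search
args = {'q': 'test', 'safe': 1}
if search_type:
    args['fmt'] = search_type
print(urlencode(args))       # q=test&safe=1 -- no empty fmt= parameter is sent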
@@ -74,24 +74,25 @@ Startpage's category (for Web-search, News, Videos, ..) is set by

.. hint::

   The default category is ``web`` .. and other categories than ``web`` are not
   yet implemented.
   Supported categories are ``web``, ``news`` and ``images``.

"""
# pylint: disable=too-many-statements
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from json import loads

import dateutil.parser
import lxml.html
import babel.localedata

from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
@@ -250,22 +251,13 @@ def request(query, params):
    Additionally the arguments form Startpage's search form needs to be set in
    HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return params


def _request_cat_web(query, params):

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
        'query': query,
        'cat': 'web',
        'cat': startpage_categ,
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
        'with_date': time_range_dict.get(params['time_range'], ''),
@@ -317,73 +309,118 @@ def _request_cat_web(query, params):
    return params


# get response from search-request
def _parse_published_date(content: str) -> tuple[str, datetime | None]:
    published_date = None

    # check if search result starts with something like: "2 Sep 2014 ... "
    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]
        # fix content string
        content = content[date_pos:]

        try:
            published_date = dateutil.parser.parse(date_string, dayfirst=True)
        except ValueError:
            pass

    # check if search result starts with something like: "5 days ago ... "
    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]

        # calculate datetime
        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

        # fix content string
        content = content[date_pos:]

    return content, published_date


def _get_web_result(result):
    content = html_to_text(result.get('description'))
    content, publishedDate = _parse_published_date(content)

    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': content,
        'publishedDate': publishedDate,
    }


def _get_news_result(result):

    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    publishedDate = None
    if result.get('date'):
        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': publishedDate,
        'thumbnail': thumbnailUrl,
    }


def _get_image_result(result) -> dict[str, Any] | None:
    url = result.get('altClickUrl')
    if not url:
        return None

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

    filesize = None
    if result.get('filesize'):
        size_str = ''.join(filter(str.isdigit, result['filesize']))
        filesize = humanize_bytes(int(size_str))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnailUrl,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }


def response(resp):
    dom = lxml.html.fromstring(resp.text)
    categ = startpage_categ.capitalize()
    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    results_json = loads(results_raw)
    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})

    if startpage_categ == 'web':
        return _response_cat_web(dom)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return []


def _response_cat_web(dom):
    results = []
    for results_categ in results_obj.get('mainline', []):
        for item in results_categ.get('results', []):
            if results_categ['display_type'] == 'web-google':
                results.append(_get_web_result(item))
            elif results_categ['display_type'] == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in results_categ['display_type']:
                item = _get_image_result(item)
                if item:
                    results.append(item)

    # parse results
    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(eval_xpath(link, 'h2'))
        content = eval_xpath(result, './/p[contains(@class, "description")]')
        content = extract_text(content, allow_none=True) or ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0 : date_pos - 5]
            # fix content string
            content = content[date_pos:]

            try:
                published_date = dateutil.parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0 : date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results
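The _parse_published_date() helper added above splits a leading date prefix such as "2 Sep 2014 ... " or "5 days ago ... " off the snippet text. A standalone illustration of the same arithmetic (python-dateutil assumed, as in the imports above):

import re
from datetime import datetime, timedelta
import dateutil.parser

content = "2 Sep 2014 ... rest of the snippet"
if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
    date_pos = content.find('...') + 4
    date_string = content[0 : date_pos - 5]   # "2 Sep 2014"
    content = content[date_pos:]              # "rest of the snippet"
    print(dateutil.parser.parse(date_string, dayfirst=True), repr(content))

content = "5 days ago ... another snippet"
if re.match(r"^[0-9]+ days? ago \.\.\. ", content):
    days = int(re.match(r'\d+', content).group())
    print(datetime.now() - timedelta(days=days))  # approximate publication date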
@@ -1451,9 +1451,12 @@ engines:
    frontend_url: https://srv.piped.video
    # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/
    backend_url:
      - https://pipedapi.kavin.rocks
      - https://pipedapi-libre.kavin.rocks
      - https://pipedapi.adminforge.de
      - https://pipedapi.nosebs.ru
      - https://pipedapi.ducks.party
      - https://pipedapi.reallyaweso.me
      - https://api.piped.private.coffee
      - https://pipedapi.darkness.services

  - name: piped.music
    engine: piped
@@ -1787,11 +1790,23 @@ engines:
  - name: startpage
    engine: startpage
    shortcut: sp
    timeout: 6.0
    disabled: true
    startpage_categ: web
    categories: [general, web]
    additional_tests:
      rosebud: *test_rosebud

  - name: startpage news
    engine: startpage
    startpage_categ: news
    categories: [news, web]
    shortcut: spn

  - name: startpage images
    engine: startpage
    startpage_categ: images
    categories: [images, web]
    shortcut: spi

  - name: tokyotoshokan
    engine: tokyotoshokan
    shortcut: tt
@@ -470,6 +470,21 @@ def ecma_unescape(string: str) -> str:
    return string


def remove_pua_from_str(string):
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """
    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
    s = []
    for c in string:
        i = ord(c)
        if any(a <= i <= b for (a, b) in pua_ranges):
            continue
        s.append(c)
    return "".join(s)


def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
    rep = {re.escape(k): v for k, v in replaces.items()}
    pattern = re.compile("|".join(rep.keys()))
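Usage sketch for the new remove_pua_from_str() helper (importable from searx.utils, as used by the startpage engine above):

from searx.utils import remove_pua_from_str

title = "Example\ue000 title"               # contains U+E000, a Private Use Area code point
print(remove_pua_from_str(title))           # -> "Example title"
print(remove_pua_from_str("no pua here"))   # strings without PUA code points pass through unchanged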
@@ -594,7 +594,7 @@ def health():
@app.route('/client<token>.css', methods=['GET', 'POST'])
def client_token(token=None):
    link_token.ping(sxng_request, token)
    return Response('', mimetype='text/css')
    return Response('', mimetype='text/css', headers={"Cache-Control": "no-store, max-age=0"})


@app.route('/rss.xsl', methods=['GET', 'POST'])
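A hedged way to check the new header against a running instance; the URL, port and token value here are illustrative and the requests package is assumed:

import requests

resp = requests.get("http://127.0.0.1:8888/client1234abcd.css")
print(resp.status_code, resp.headers.get("Cache-Control"))
# expected: 200 no-store, max-age=0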