Merge branch 'searxng:master' into master

Commit 97f05ed4d3 by Azure Star, 2023-04-04 22:59:23 +02:00, committed via GitHub (GPG key ID: 4AEE18F83AFDEB23)
68 changed files with 133 additions and 98 deletions


@@ -4,7 +4,7 @@ cov-core==1.15.0
black==22.12.0
pylint==2.17.1
splinter==0.19.0
selenium==4.8.2
selenium==4.8.3
twine==4.0.2
Pallets-Sphinx-Themes==2.0.3
Sphinx==5.3.0


@@ -12,7 +12,7 @@ Brotli==1.0.9
uvloop==0.17.0
httpx-socks[asyncio]==0.7.2
setproctitle==1.3.2
redis==4.5.1
redis==4.5.4
markdown-it-py==2.2.0
typing_extensions==4.5.0
fasttext-predict==0.9.2.1


@@ -6,6 +6,7 @@ DuckDuckGo Lite
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
import json
import babel
@@ -15,6 +16,7 @@ from searx import (
network,
locales,
redislib,
external_bang,
)
from searx import redisdb
from searx.utils import (
@@ -197,6 +199,17 @@ ddg_lang_map = {
def request(query, params):
# quote ddg bangs
query_parts = []
# for val in re.split(r'(\s+)', query):
for val in re.split(r'(\s+)', query):
if not val.strip():
continue
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]):
val = f"'{val}'"
query_parts.append(val)
query = ' '.join(query_parts)
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
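
A minimal, stand-alone sketch of the bang-quoting step added above; the KNOWN_BANGS set below is a made-up stand-in for the external_bang.EXTERNAL_BANGS trie that get_node() consults, so only the splitting and quoting logic mirrors the change:

import re

# Hypothetical stand-in for searx.external_bang.get_node(EXTERNAL_BANGS, ...)
KNOWN_BANGS = {"w", "g", "yt"}

def quote_bangs(query: str) -> str:
    """Wrap recognized !bang tokens in single quotes so DuckDuckGo Lite
    treats them as plain text instead of redirecting to the bang target."""
    query_parts = []
    # split on whitespace but keep the separators, like the engine code does
    for val in re.split(r"(\s+)", query):
        if not val.strip():
            continue
        if val.startswith("!") and val[1:] in KNOWN_BANGS:
            val = f"'{val}'"
        query_parts.append(val)
    return " ".join(query_parts)

print(quote_bangs("!w python gil"))  # -> '!w' python gil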


@@ -1,14 +1,22 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Flickr (Images)
# lint: pylint
"""Flickr (Images)
"""
from json import loads
from typing import TYPE_CHECKING
import json
from time import time
import re
from urllib.parse import urlencode
from searx.utils import ecma_unescape, html_to_text
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://www.flickr.com',
@@ -19,23 +27,24 @@ about = {
"results": 'HTML',
}
# engine dependent config
categories = ['images']
url = 'https://www.flickr.com/'
search_url = url + 'search?{query}&page={page}'
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')
paging = True
time_range_support = True
safesearch = False
time_range_dict = {
'day': 60 * 60 * 24,
'week': 60 * 60 * 24 * 7,
'month': 60 * 60 * 24 * 7 * 4,
'year': 60 * 60 * 24 * 7 * 52,
}
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')
search_url = 'https://www.flickr.com/search?{query}&page={page}'
time_range_url = '&min_upload_date={start}&max_upload_date={end}'
photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M)
def build_flickr_url(user_id, photo_id):
@@ -55,51 +64,59 @@ def request(query, params):
return params
def response(resp):
def response(resp): # pylint: disable=too-many-branches
results = []
matches = modelexport_re.search(resp.text)
if matches is None:
return results
match = matches.group(1)
model_export = loads(match)
model_export = json.loads(match)
if 'legend' not in model_export:
return results
legend = model_export['legend']
# handle empty page
if not legend or not legend[0]:
return results
for index in legend:
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
for x, index in enumerate(legend):
if len(index) != 8:
logger.debug("skip legend enty %s : %s", x, index)
continue
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][index[4]][index[5]][int(index[6])][
index[7]
]
author = ecma_unescape(photo.get('realname', ''))
source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
source = ecma_unescape(photo.get('username', ''))
if source:
source += ' @ Flickr'
title = ecma_unescape(photo.get('title', ''))
content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the smallest format
size_data = None
for image_size in image_sizes:
if image_size in photo['sizes']:
img_src = photo['sizes'][image_size]['url']
img_format = (
'jpg ' + str(photo['sizes'][image_size]['width']) + 'x' + str(photo['sizes'][image_size]['height'])
)
if image_size in photo['sizes']['data']:
size_data = photo['sizes']['data'][image_size]['data']
break
if not img_src:
logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
if not size_data:
logger.debug('cannot find valid image size: {0}'.format(repr(photo['sizes']['data'])))
continue
img_src = size_data['url']
img_format = f"{size_data['width']} x {size_data['height']}"
# For a bigger thumbnail, keep only the url_z, not the url_n
if 'n' in photo['sizes']:
thumbnail_src = photo['sizes']['n']['url']
elif 'z' in photo['sizes']:
thumbnail_src = photo['sizes']['z']['url']
if 'n' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['n']['data']['url']
elif 'z' in photo['sizes']['data']:
thumbnail_src = photo['sizes']['data']['z']['data']['url']
else:
thumbnail_src = img_src
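
As a hedged illustration of the new size handling, the selection loop above can be exercised stand-alone; the nested sizes/data layout below is assumed example data shaped like Flickr's modelExport JSON, not captured output:

image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'm', 'n', 't', 'q', 's')

# hand-made photo dict mimicking the photo['sizes']['data'] structure
photo = {
    'sizes': {
        'data': {
            'z': {'data': {'url': 'https://live.staticflickr.com/example_z.jpg', 'width': 640, 'height': 480}},
            'n': {'data': {'url': 'https://live.staticflickr.com/example_n.jpg', 'width': 320, 'height': 240}},
        }
    }
}

size_data = None
for image_size in image_sizes:  # ordered from biggest to smallest
    if image_size in photo['sizes']['data']:
        size_data = photo['sizes']['data'][image_size]['data']
        break

if size_data:
    img_src = size_data['url']
    img_format = f"{size_data['width']} x {size_data['height']}"
    print(img_src, img_format)  # the 'z' size wins, it comes first in image_sizes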


@@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
from typing import TYPE_CHECKING
import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode
import base64
from lxml import html
import babel
@@ -144,34 +142,17 @@ def response(resp):
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the
# article. The href attribute of the <a> is a google internal link,
# which we can't use. The real link is hidden in the jslog attribute:
#
# <a ...
# jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
# href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
# ... />
# The first <a> tag in the <article> contains the link to the article.
# The href attribute of the <a> tag is a google internal link that we
# have to decode:
jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
url = re.findall('http[^;]*', jslog)
if url:
url = url[0]
else:
# The real URL is base64 encoded in the json attribute:
# jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
jslog = jslog.split(";")[1].split(':')[1].strip()
try:
padding = (4 - (len(jslog) % 4)) * "="
jslog = b64decode(jslog + padding)
except binascii.Error:
# URL can't be read, skip this result
continue
href = eval_xpath_getindex(result, './article/a/@href', 0)
href = href.split('?')[0]
href = href.split('/')[-1]
href = base64.urlsafe_b64decode(href + '====')
href = href[href.index(b'http') :].split(b'\xd2')[0]
href = href.decode()
# now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
url = re.findall('http[^;"]*', str(jslog))[0]
# the first <h3> tag in the <article> contains the title of the link
title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesterday', not a real
@@ -189,7 +170,7 @@ def response(resp):
results.append(
{
'url': url,
'url': href,
'title': title,
'content': content,
'img_src': img_src,
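
The new href decoding can be compared with a small, self-contained sketch; the fake_href below is a hand-made example built the same way it is decoded, since real ./articles/... tokens are Google-internal blobs:

import base64

def decode_gnews_href(href: str) -> str:
    """Same decoding steps as the diff: drop the query string, keep the last
    path segment, base64-decode it and cut out the embedded http... URL."""
    token = href.split('?')[0].split('/')[-1]
    raw = base64.urlsafe_b64decode(token + '====')  # over-padding is tolerated
    return raw[raw.index(b'http'):].split(b'\xd2')[0].decode()

# hand-made token for illustration only
payload = b'\x08\x13"\x12' + b'https://www.cnn.com/example/index.html' + b'\xd2\x01\x00'
fake_href = './articles/' + base64.urlsafe_b64encode(payload).decode() + '?hl=en-US&gl=US'
print(decode_gnews_href(fake_href))  # -> https://www.cnn.com/example/index.html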


@@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Seznam
# lint: pylint
"""Seznam
"""
from urllib.parse import urlencode
@@ -11,7 +12,6 @@ from searx.utils import (
extract_text,
eval_xpath_list,
eval_xpath_getindex,
eval_xpath,
)
# about
@@ -54,8 +54,12 @@ def response(resp):
results = []
dom = html.fromstring(resp.content.decode())
for result_element in eval_xpath_list(dom, '//div[@data-dot="results"]/div'):
result_data = eval_xpath_getindex(result_element, './/div[contains(@class, "bec586")]', 0, default=None)
for result_element in eval_xpath_list(
dom, '//div[@id="searchpage-root"]//div[@class="Layout--left"]/div[@class="f2c528"]'
):
result_data = eval_xpath_getindex(
result_element, './/div[@class="c8774a" or @class="e69e8d a11657"]', 0, default=None
)
if result_data is None:
continue
title_element = eval_xpath_getindex(result_element, './/h3/a', 0)
@@ -63,7 +67,7 @@
{
'url': title_element.get('href'),
'title': extract_text(title_element),
'content': extract_text(eval_xpath(result_data, './/div[@class="_3eded7"]')),
'content': extract_text(result_data),
}
)


@@ -17,21 +17,26 @@ import re
from flask import request
from searx import redisdb
from searx.plugins import logger
from searx.redislib import incr_sliding_window
name = "Request limiter"
description = "Limit the number of request"
default_on = False
preference_section = 'service'
logger = logger.getChild('limiter')
re_bot = re.compile(
block_user_agent = re.compile(
r'('
+ r'[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'unknown'
+ r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
+ r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
+ r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
+ r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
+ r'|ZmEu|BLEXBot|bitlybot'
# when you block requests from Farside instances, your instance will
# disappear from https://farside.link/
# + r'|Farside'
+ r')'
)
@@ -39,47 +44,59 @@ re_bot = re.compile(
def is_accepted_request() -> bool:
# pylint: disable=too-many-return-statements
redis_client = redisdb.client()
user_agent = request.headers.get('User-Agent', '')
user_agent = request.headers.get('User-Agent', 'unknown')
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
if request.path == '/image_proxy':
if re_bot.match(user_agent):
return False
if request.path == '/healthz':
return True
if block_user_agent.match(user_agent):
logger.debug("BLOCK %s: %s --> detected User-Agent: %s" % (x_forwarded_for, request.path, user_agent))
return False
if request.path == '/search':
c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
if c_burst > 15 or c_10min > 150:
logger.debug("to many request") # pylint: disable=undefined-variable
return False
if re_bot.match(user_agent):
logger.debug("detected bot") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: to many request", x_forwarded_for)
return False
if len(request.headers.get('Accept-Language', '').strip()) == 0:
logger.debug("missing Accept-Language") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: missing Accept-Language", x_forwarded_for)
return False
if request.headers.get('Connection') == 'close':
logger.debug("got Connection=close") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: got Connection=close", x_forwarded_for)
return False
accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
logger.debug("suspicious Accept-Encoding") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: suspicious Accept-Encoding", x_forwarded_for)
return False
if 'text/html' not in request.accept_mimetypes:
logger.debug("Accept-Encoding misses text/html") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: Accept-Encoding misses text/html", x_forwarded_for)
return False
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
if c > 4:
logger.debug("API limit exceeded") # pylint: disable=undefined-variable
logger.debug("BLOCK %s: API limit exceeded", x_forwarded_for)
return False
logger.debug(
"OK %s: '%s'" % (x_forwarded_for, request.path)
+ " || form: %s" % request.form
+ " || Accept: %s" % request.headers.get('Accept', '')
+ " || Accept-Language: %s" % request.headers.get('Accept-Language', '')
+ " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding', '')
+ " || Content-Type: %s" % request.headers.get('Content-Type', '')
+ " || Content-Length: %s" % request.headers.get('Content-Length', '')
+ " || Connection: %s" % request.headers.get('Connection', '')
+ " || User-Agent: %s" % user_agent
)
return True
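
For readers unfamiliar with incr_sliding_window, here is a generic, hedged sketch of a sliding-window counter built on a Redis sorted set; it illustrates the idea only and is not the searx.redislib implementation, though the thresholds in the usage lines are copied from the code above:

import time
import uuid

import redis  # redis-py

def incr_sliding_window(client: redis.Redis, name: str, duration: int) -> int:
    """Record one hit for `name` and return how many hits happened in the
    last `duration` seconds (approximate sliding window)."""
    now = time.time()
    key = 'limiter:' + name
    with client.pipeline() as pipe:
        pipe.zremrangebyscore(key, 0, now - duration)  # drop expired hits
        pipe.zadd(key, {uuid.uuid4().hex: now})        # record this hit
        pipe.zcard(key)                                # count what remains
        pipe.expire(key, duration)
        _, _, count, _ = pipe.execute()
    return count

client = redis.Redis()
# mirror the burst check above: more than 15 searches within 20 seconds
if incr_sliding_window(client, 'IP limit, burst' + '203.0.113.7', 20) > 15:
    print("BLOCK: too many requests")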


@@ -11,10 +11,10 @@
"grunt-eslint": "^24.0.0",
"grunt-stylelint": "^0.16.0",
"grunt-image": "^6.4.0",
"ionicons": "^6.0.2",
"ionicons": "^7.1.0",
"less": "^4.1.3",
"less-plugin-clean-css": "^1.5.1",
"sharp": "^0.31.0",
"sharp": "^0.32.0",
"stylelint": "^13.13.1",
"stylelint-config-standard": "^22.0.0",
"ejs": "^3.1.8",


@@ -11,7 +11,7 @@ msgstr ""
"Project-Id-Version: PROJECT VERSION\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
"POT-Creation-Date: 2023-02-20 11:22+0000\n"
"PO-Revision-Date: 2023-02-24 07:07+0000\n"
"PO-Revision-Date: 2023-03-30 12:37+0000\n"
"Last-Translator: return42 <markus.heiser@darmarit.de>\n"
"Language-Team: Norwegian Bokmål <https://translate.codeberg.org/projects/"
"searxng/searxng/nb_NO/>\n"
@@ -20,7 +20,7 @@ msgstr ""
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=2; plural=n != 1;\n"
"X-Generator: Weblate 4.15.2\n"
"X-Generator: Weblate 4.16.4\n"
"Generated-By: Babel 2.11.0\n"
#. CONSTANT_NAMES['DEFAULT_GROUP_NAME']
@@ -392,10 +392,12 @@ msgid ""
"You are using Tor and it looks like you have this external IP address: "
"{ip_address}"
msgstr ""
"Du bruker Tor og det ser ut som om du har denne eksterne IP adressen: "
"{ip_address}"
#: searx/plugins/tor_check.py:86
msgid "You are not using Tor and you have this external IP address: {ip_address}"
msgstr ""
msgstr "Du bruker ikke Tor og du har denne IP adressen: {ip_address}"
#: searx/plugins/tracker_url_remover.py:29
msgid "Tracker URL remover"


@@ -10,20 +10,22 @@
# POORAJITH ST <gokulkannanst@gmail.com>, 2019
# Prasanna Venkadesh <prasmailme@gmail.com>, 2019
# Markus Heiser <markus.heiser@darmarit.de>, 2022.
# return42 <markus.heiser@darmarit.de>, 2023.
msgid ""
msgstr ""
"Project-Id-Version: searx\n"
"Project-Id-Version: searx\n"
"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
"POT-Creation-Date: 2023-02-20 11:22+0000\n"
"PO-Revision-Date: 2022-11-04 07:18+0000\n"
"Last-Translator: Markus Heiser <markus.heiser@darmarit.de>\n"
"PO-Revision-Date: 2023-03-30 12:37+0000\n"
"Last-Translator: return42 <markus.heiser@darmarit.de>\n"
"Language-Team: Tamil <https://translate.codeberg.org/projects/searxng/"
"searxng/ta/>\n"
"Language: ta\n"
"Language-Team: Tamil "
"<https://weblate.bubu1.eu/projects/searxng/searxng/ta/>\n"
"Plural-Forms: nplurals=2; plural=n != 1;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=2; plural=n != 1;\n"
"X-Generator: Weblate 4.16.4\n"
"Generated-By: Babel 2.11.0\n"
#. CONSTANT_NAMES['DEFAULT_GROUP_NAME']
@@ -427,7 +429,7 @@ msgstr "பக்கம் கிடைக்கவில்லை"
#: searx/templates/simple/404.html:6
#, python-format
msgid "Go to %(search_page)s."
msgstr "%(search_page)s-க்கு செல்"
msgstr "%(search_page)s-க்கு செல்."
#: searx/templates/simple/404.html:6
msgid "search page"
@@ -956,7 +958,7 @@ msgstr "முன் பக்கத்தைக் காட்டு"
#: searx/templates/simple/search.html:9
#: searx/templates/simple/simple_search.html:5
msgid "Search for..."
msgstr "எதைப்பற்றி தேட வேண்டும்?"
msgstr "எதைப்பற்றி தேட வேண்டும..."
#: searx/templates/simple/search.html:10
#: searx/templates/simple/simple_search.html:6
@@ -1529,4 +1531,3 @@ msgstr "காணொளிகளை மறை"
#~ msgid "Automatically detect the query search language and switch to it."
#~ msgstr ""