forked from Ponysearch/Ponysearch
[mod] implement brave (WEB) engine to replace XPath configuration
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
d151497db3
commit
460bbe5b81
3 changed files with 263 additions and 68 deletions
13
docs/dev/engines/online/brave.rst
Normal file
13
docs/dev/engines/online/brave.rst
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
.. _brave engine:
|
||||||
|
|
||||||
|
=============
|
||||||
|
Brave Engines
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. contents:: Contents
|
||||||
|
:depth: 2
|
||||||
|
:local:
|
||||||
|
:backlinks: entry
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.brave
|
||||||
|
:members:
|
|
@ -1,10 +1,56 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""
|
# lint: pylint
|
||||||
Brave (General, news, videos, images)
|
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
|
||||||
"""
|
news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
|
||||||
|
<time_range_support>` is limited (see remarks).
|
||||||
|
|
||||||
|
Configured ``brave`` engines:
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: brave
|
||||||
|
engine: brave
|
||||||
|
...
|
||||||
|
brave_category: search
|
||||||
|
time_range_support: true
|
||||||
|
paging: true
|
||||||
|
|
||||||
|
- name: brave.images
|
||||||
|
engine: brave
|
||||||
|
...
|
||||||
|
brave_category: images
|
||||||
|
|
||||||
|
- name: brave.videos
|
||||||
|
engine: brave
|
||||||
|
...
|
||||||
|
brave_category: videos
|
||||||
|
|
||||||
|
- name: brave.news
|
||||||
|
engine: brave
|
||||||
|
...
|
||||||
|
brave_category: news
|
||||||
|
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
|
"""
|
||||||
|
# pylint: disable=fixme
|
||||||
|
|
||||||
|
from urllib.parse import (
|
||||||
|
urlencode,
|
||||||
|
urlparse,
|
||||||
|
parse_qs,
|
||||||
|
)
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
|
||||||
import chompjs
|
import chompjs
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
from searx.utils import (
|
||||||
|
extract_text,
|
||||||
|
eval_xpath_list,
|
||||||
|
eval_xpath_getindex,
|
||||||
|
)
|
||||||
|
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://search.brave.com/',
|
"website": 'https://search.brave.com/',
|
||||||
|
@ -14,41 +60,87 @@ about = {
|
||||||
"require_api_key": False,
|
"require_api_key": False,
|
||||||
"results": 'HTML',
|
"results": 'HTML',
|
||||||
}
|
}
|
||||||
|
|
||||||
base_url = "https://search.brave.com/"
|
base_url = "https://search.brave.com/"
|
||||||
|
categories = []
|
||||||
|
brave_category = 'search'
|
||||||
|
"""Brave supports common web-search, video search, image search and news search.
|
||||||
|
|
||||||
|
- ``search``: Common WEB search
|
||||||
|
- ``videos``: search for videos
|
||||||
|
- ``images``: search for images
|
||||||
|
- ``news``: search for news
|
||||||
|
"""
|
||||||
|
|
||||||
|
brave_spellcheck = False
|
||||||
|
"""Brave supports some kind of spell checking. When activated, Brave tries to
|
||||||
|
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
|
||||||
|
the UI of Brave the user gets warned about this.  Since we cannot warn the user
|
||||||
|
in SearXNG, the spellchecking is disabled by default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
send_accept_language_header = True
|
||||||
paging = False
|
paging = False
|
||||||
categories = ['images', 'videos', 'news'] # images, videos, news
|
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
|
||||||
|
category All)."""
|
||||||
|
|
||||||
|
safesearch = True
|
||||||
|
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off
|
||||||
|
|
||||||
|
time_range_support = False
|
||||||
|
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
|
||||||
|
category All)."""
|
||||||
|
|
||||||
|
time_range_map = {
|
||||||
|
'day': 'pd',
|
||||||
|
'week': 'pw',
|
||||||
|
'month': 'pm',
|
||||||
|
'year': 'py',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
|
||||||
|
# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
|
||||||
|
params['headers']['Accept-Encoding'] = 'gzip, deflate'
|
||||||
|
|
||||||
args = {
|
args = {
|
||||||
'q': query,
|
'q': query,
|
||||||
'spellcheck': 1,
|
|
||||||
}
|
}
|
||||||
params["url"] = f"{base_url}{categories[0]}?{urlencode(args)}"
|
if brave_spellcheck:
|
||||||
|
args['spellcheck'] = '1'
|
||||||
|
|
||||||
|
if brave_category == 'search':
|
||||||
|
if params.get('pageno', 1) - 1:
|
||||||
|
args['offset'] = params.get('pageno', 1) - 1
|
||||||
|
if time_range_map.get(params['time_range']):
|
||||||
|
args['tf'] = time_range_map.get(params['time_range'])
|
||||||
|
|
||||||
def get_video_results(json_data):
|
params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"
|
||||||
results = []
|
|
||||||
|
|
||||||
for result in json_data:
|
# set preferences in cookie
|
||||||
results.append(
|
params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
|
||||||
{
|
|
||||||
'template': 'videos.html',
|
|
||||||
'url': result['url'],
|
|
||||||
'thumbnail_src': result['thumbnail']['src'],
|
|
||||||
'img_src': result['properties']['url'],
|
|
||||||
'content': result['description'],
|
|
||||||
'title': result['title'],
|
|
||||||
'source': result['source'],
|
|
||||||
'duration': result['video']['duration'],
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
# ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
|
||||||
|
# limited and the country handling has its quirks
|
||||||
|
|
||||||
|
eng_locale = params.get('searxng_locale')
|
||||||
|
params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country'
|
||||||
|
params['cookies']['summarizer'] = '0'
|
||||||
|
|
||||||
|
if not eng_locale or eng_locale == 'all':
|
||||||
|
params['cookies']['country'] = 'all' # country=all
|
||||||
|
else:
|
||||||
|
params['cookies']['country'] = eng_locale.split('-')[-1].lower()
|
||||||
|
params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()
|
||||||
|
|
||||||
|
# logger.debug("cookies %s", params['cookies'])
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
|
||||||
|
if brave_category == 'search':
|
||||||
|
return _parse_search(resp)
|
||||||
|
|
||||||
datastr = ""
|
datastr = ""
|
||||||
for line in resp.text.split("\n"):
|
for line in resp.text.split("\n"):
|
||||||
|
@ -57,10 +149,81 @@ def response(resp):
|
||||||
break
|
break
|
||||||
|
|
||||||
json_data = chompjs.parse_js_object(datastr)
|
json_data = chompjs.parse_js_object(datastr)
|
||||||
|
|
||||||
json_resp = json_data[1]['data']['body']['response']
|
json_resp = json_data[1]['data']['body']['response']
|
||||||
if categories[0] == 'news':
|
|
||||||
|
if brave_category == 'news':
|
||||||
json_resp = json_resp['news']
|
json_resp = json_resp['news']
|
||||||
|
return _parse_news(json_resp)
|
||||||
|
|
||||||
|
if brave_category == 'images':
|
||||||
|
return _parse_images(json_resp)
|
||||||
|
if brave_category == 'videos':
|
||||||
|
return _parse_videos(json_resp)
|
||||||
|
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_search(resp):
    """Parse the HTML page of Brave's WEB search (``brave_category: search``).

    :param resp: HTTP response; only ``resp.text`` (the HTML document) is read.
    :returns: A list of result dicts.  An optional leading ``{'answer': ...}``
        item is followed by one item per snippet with the keys ``url``,
        ``title``, ``content`` and ``img_src``.  Snippets carrying a video tag
        additionally get ``iframe_src``, ``template`` and ``thumbnail`` when
        :py:obj:`_get_iframe_src` yields an embed URL; otherwise ``img_src``
        is replaced by the image found inside the video tag.

    .. note::

       lxml elements evaluate to ``False`` when they have no child elements.
       The tags selected below are therefore compared against ``None``
       explicitly; a plain truth test would silently drop text-only nodes.
    """
    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    if answer_tag is not None:
        result_list.append({'answer': extract_text(answer_tag)})

    # A stricter selector that has been tried:
    #   '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
        # a snippet without link or title is not a usable result
        if not url or title_tag is None:
            continue

        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            else:
                item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

        result_list.append(item)

    return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def _get_iframe_src(url):
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
if parsed_url.path == '/watch' and parsed_url.query:
|
||||||
|
video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
|
||||||
|
if video_id:
|
||||||
|
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_news(json_resp):
|
||||||
|
result_list = []
|
||||||
|
|
||||||
for result in json_resp["results"]:
|
for result in json_resp["results"]:
|
||||||
item = {
|
item = {
|
||||||
|
@ -68,18 +231,53 @@ def response(resp):
|
||||||
'title': result['title'],
|
'title': result['title'],
|
||||||
'content': result['description'],
|
'content': result['description'],
|
||||||
}
|
}
|
||||||
|
if result['thumbnail'] != "null":
|
||||||
|
item['img_src'] = result['thumbnail']['src']
|
||||||
|
result_list.append(item)
|
||||||
|
|
||||||
|
return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_images(json_resp):
|
||||||
|
result_list = []
|
||||||
|
|
||||||
|
for result in json_resp["results"]:
|
||||||
|
item = {
|
||||||
|
'url': result['url'],
|
||||||
|
'title': result['title'],
|
||||||
|
'content': result['description'],
|
||||||
|
'template': 'images.html',
|
||||||
|
'img_format': result['properties']['format'],
|
||||||
|
'source': result['source'],
|
||||||
|
'img_src': result['properties']['url'],
|
||||||
|
}
|
||||||
|
result_list.append(item)
|
||||||
|
|
||||||
|
return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_videos(json_resp):
|
||||||
|
result_list = []
|
||||||
|
|
||||||
|
for result in json_resp["results"]:
|
||||||
|
|
||||||
|
url = result['url']
|
||||||
|
item = {
|
||||||
|
'url': url,
|
||||||
|
'title': result['title'],
|
||||||
|
'content': result['description'],
|
||||||
|
'template': 'videos.html',
|
||||||
|
'length': result['video']['duration'],
|
||||||
|
'duration': result['video']['duration'],
|
||||||
|
}
|
||||||
|
|
||||||
if result['thumbnail'] != "null":
|
if result['thumbnail'] != "null":
|
||||||
item['thumbnail'] = result['thumbnail']['src']
|
item['thumbnail'] = result['thumbnail']['src']
|
||||||
|
|
||||||
if categories[0] == 'images':
|
iframe_src = _get_iframe_src(url)
|
||||||
item['template'] = 'images.html'
|
if iframe_src:
|
||||||
item['img_format'] = result['properties']['format']
|
item['iframe_src'] = iframe_src
|
||||||
item['source'] = result['source']
|
|
||||||
item['img_src'] = result['properties']['url']
|
|
||||||
elif categories[0] == 'videos':
|
|
||||||
item['template'] = 'videos.html'
|
|
||||||
item['length'] = result['video']['duration']
|
|
||||||
|
|
||||||
results.append(item)
|
result_list.append(item)
|
||||||
|
|
||||||
return results
|
return result_list
|
||||||
|
|
|
@ -1816,50 +1816,34 @@ engines:
|
||||||
timeout: 9.0
|
timeout: 9.0
|
||||||
|
|
||||||
- name: brave
|
- name: brave
|
||||||
shortcut: brave
|
engine: brave
|
||||||
engine: xpath
|
shortcut: br
|
||||||
paging: true
|
|
||||||
time_range_support: true
|
time_range_support: true
|
||||||
first_page_num: 0
|
paging: true
|
||||||
time_range_url: "&tf={time_range_val}"
|
|
||||||
search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range}
|
|
||||||
url_xpath: //a[@class="result-header"]/@href
|
|
||||||
title_xpath: //span[@class="snippet-title"]
|
|
||||||
content_xpath: //p[1][@class="snippet-description"]
|
|
||||||
suggestion_xpath: //div[@class="text-gray h6"]/a
|
|
||||||
time_range_map:
|
|
||||||
day: 'pd'
|
|
||||||
week: 'pw'
|
|
||||||
month: 'pm'
|
|
||||||
year: 'py'
|
|
||||||
categories: [general, web]
|
categories: [general, web]
|
||||||
disabled: true
|
brave_category: search
|
||||||
headers:
|
# brave_spellcheck: true
|
||||||
Accept-Encoding: gzip, deflate
|
|
||||||
about:
|
|
||||||
website: https://brave.com/search/
|
|
||||||
wikidata_id: Q107355971
|
|
||||||
use_official_api: false
|
|
||||||
require_api_key: false
|
|
||||||
results: HTML
|
|
||||||
|
|
||||||
- name: brave.images
|
- name: brave.images
|
||||||
shortcut: braveimg
|
|
||||||
engine: brave
|
engine: brave
|
||||||
categories: images
|
network: brave
|
||||||
disabled: true
|
shortcut: brimg
|
||||||
|
categories: [images, web]
|
||||||
|
brave_category: images
|
||||||
|
|
||||||
- name: brave.videos
|
- name: brave.videos
|
||||||
shortcut: bravevid
|
|
||||||
engine: brave
|
engine: brave
|
||||||
categories: videos
|
network: brave
|
||||||
disabled: true
|
shortcut: brvid
|
||||||
|
categories: [videos, web]
|
||||||
|
brave_category: videos
|
||||||
|
|
||||||
- name: brave.news
|
- name: brave.news
|
||||||
shortcut: bravenews
|
|
||||||
engine: brave
|
engine: brave
|
||||||
|
network: brave
|
||||||
|
shortcut: brnews
|
||||||
categories: news
|
categories: news
|
||||||
disabled: true
|
brave_category: news
|
||||||
|
|
||||||
- name: petalsearch
|
- name: petalsearch
|
||||||
shortcut: pts
|
shortcut: pts
|
||||||
|
|
Loading…
Reference in a new issue