forked from Ponysearch/Ponysearch
[mod] presearch: add language & region support
In Presearch there are languages for the UI and regions for narrowing down the search. With this change the SearXNG engine supports a search by region. The details can be found in the documentation of the source code. To test, you can search terms like:: !presearch bmw :zh-TW !presearch bmw :en-CA 1. You should get results corresponding to the region (Taiwan, Canada) 2. and in the language (Chinese, English). 3. The context in info box content is in the same language. Exceptions: 1. Region or language is not supported by Presearch or 2. SearXNG user did not select a region tag, example:: !presearch bmw :en Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
a2c269bbac
commit
e560d7e373
2 changed files with 113 additions and 17 deletions
13
docs/dev/engines/online/presearch.rst
Normal file
13
docs/dev/engines/online/presearch.rst
Normal file
|
@ -0,0 +1,13 @@
|
|||
.. _engine presearch:
|
||||
|
||||
================
|
||||
Presearch Engine
|
||||
================
|
||||
|
||||
.. contents::
|
||||
:depth: 2
|
||||
:local:
|
||||
:backlinks: entry
|
||||
|
||||
.. automodule:: searx.engines.presearch
|
||||
:members:
|
|
@ -1,23 +1,72 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Presearch (general, images, videos, news)
|
||||
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
|
||||
images, videos, news).
|
||||
|
||||
Configured ``presearch`` engines:
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
- name: presearch
|
||||
engine: presearch
|
||||
search_type: search
|
||||
categories: [general, web]
|
||||
|
||||
- name: presearch images
|
||||
...
|
||||
search_type: images
|
||||
categories: [images, web]
|
||||
|
||||
- name: presearch videos
|
||||
...
|
||||
search_type: videos
|
||||
categories: [general, web]
|
||||
|
||||
- name: presearch news
|
||||
...
|
||||
search_type: news
|
||||
categories: [news, web]
|
||||
|
||||
.. hint::
|
||||
|
||||
The results in the video category are most often links to pages that contain
|
||||
a video, for instance many links from Presearch's video category link
|
||||
content from facebook (aka Meta) or Twitter (aka X). Since these are not
|
||||
real links to video streams SearXNG can't use the video template for this and
|
||||
if SearXNG can't use this template, then the user doesn't want to see these
|
||||
hits in the videos category.
|
||||
|
||||
TL;DR; by default presearch's video category is placed into categories::
|
||||
By default Presearch's video category is intentionally placed into::
|
||||
|
||||
categories: [general, web]
|
||||
|
||||
|
||||
Search type ``video``
|
||||
=====================
|
||||
|
||||
The results in the video category are most often links to pages that contain a
|
||||
video, for instance many links from Presearch's video category link content
|
||||
from Facebook (aka Meta) or Twitter (aka X). Since these are not real links to
|
||||
video streams SearXNG can't use the video template for this and if SearXNG can't
|
||||
use this template, then the user doesn't want to see these hits in the videos
|
||||
category.
|
||||
|
||||
|
||||
Languages & Regions
|
||||
===================
|
||||
|
||||
In Presearch there are languages for the UI and regions for narrowing down the
|
||||
search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
|
||||
``use_local_search_results=false``, then the defaults are set for both (the
|
||||
language and the region) from the ``Accept-Language`` header.
|
||||
|
||||
Since the region is already "auto" by default, we only need to set the
|
||||
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
|
||||
have to set these values in both requests we send to Presearch; in the first
|
||||
request to get the request-ID from Presearch and in the final request to get the
|
||||
result list (see ``send_accept_language_header``).
|
||||
|
||||
|
||||
Implementations
|
||||
===============
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from searx import locales
|
||||
from searx.network import get
|
||||
from searx.utils import gen_useragent, html_to_text
|
||||
|
||||
|
@ -32,6 +81,7 @@ about = {
|
|||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
categories = ["general", "web"] # general, images, videos, news
|
||||
|
||||
search_type = "search"
|
||||
|
@ -46,19 +96,43 @@ def init(_):
|
|||
raise ValueError(f'presearch search_type: {search_type}')
|
||||
|
||||
|
||||
def _get_request_id(query, page, time_range, safesearch_param):
|
||||
def _get_request_id(query, params):
|
||||
|
||||
args = {
|
||||
"q": query,
|
||||
"page": page,
|
||||
"page": params["pageno"],
|
||||
}
|
||||
if time_range:
|
||||
args["time"] = time_range
|
||||
|
||||
if params["time_range"]:
|
||||
args["time"] = params["time_range"]
|
||||
|
||||
url = f"{base_url}/{search_type}?{urlencode(args)}"
|
||||
|
||||
headers = {
|
||||
'User-Agent': gen_useragent(),
|
||||
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
|
||||
'Cookie': (
|
||||
f"b=1;"
|
||||
f" presearch_session=;"
|
||||
f" use_local_search_results=false;"
|
||||
f" use_safe_search={safesearch_map[params['safesearch']]}"
|
||||
),
|
||||
}
|
||||
if params['searxng_locale'] != 'all':
|
||||
l = locales.get_locale(params['searxng_locale'])
|
||||
|
||||
# Presearch narrows down the search by region. In SearXNG when the user
|
||||
# does not set a region (e.g. 'en-CA' / Canada) we cannot hand over a
|
||||
# region.
|
||||
|
||||
# We could possibly use searx.locales.get_official_locales to determine
|
||||
# in which regions this language is an official one, but then we still
|
||||
# wouldn't know which region should be given more weight / Presearch
|
||||
# performs an IP-based geolocation of the user, we don't want that in
|
||||
# SearXNG ;-)
|
||||
|
||||
if l.territory:
|
||||
headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
|
||||
|
||||
resp_text = get(url, headers=headers).text # type: ignore
|
||||
|
||||
for line in resp_text.split("\n"):
|
||||
|
@ -69,8 +143,7 @@ def _get_request_id(query, page, time_range, safesearch_param):
|
|||
|
||||
|
||||
def request(query, params):
|
||||
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
||||
|
||||
request_id = _get_request_id(query, params)
|
||||
params["headers"]["Accept"] = "application/json"
|
||||
params["url"] = f"{base_url}/results?id={request_id}"
|
||||
|
||||
|
@ -109,7 +182,17 @@ def parse_search_query(json_results):
|
|||
if info:
|
||||
attributes = []
|
||||
for item in info.get('about', []):
|
||||
label, value = html_to_text(item).split(':', 1)
|
||||
|
||||
text = html_to_text(item)
|
||||
if ':' in text:
|
||||
# split text into key / value
|
||||
label, value = text.split(':', 1)
|
||||
else:
|
||||
# In other languages (tested with zh-TW) a colon is represented
|
||||
# by a different symbol --> then we split at the first space.
|
||||
label, value = text.split(' ', 1)
|
||||
label = label[:-1]
|
||||
|
||||
value = _strip_leading_strings(value)
|
||||
attributes.append({'label': label, 'value': value})
|
||||
content = []
|
||||
|
|
Loading…
Reference in a new issue