forked from Ponysearch/Ponysearch
[mod] presearch: add language & region support
In Presearch there are languages for the UI and regions for narrowing down the search. With this change the SearXNG engine supports a search by region. The details can be found in the documentation of the source code. To test, you can search terms like:: !presearch bmw :zh-TW !presearch bmw :en-CA 1. You should get results corresponding to the region (Taiwan, Canada) 2. and in the language (Chinese, English). 3. The context in info box content is in the same language. Exceptions: 1. Region or language is not supported by Presearch or 2. SearXNG user did not select a region tag, example:: !presearch bmw :en Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
a2c269bbac
commit
e560d7e373
2 changed files with 113 additions and 17 deletions
13
docs/dev/engines/online/presearch.rst
Normal file
13
docs/dev/engines/online/presearch.rst
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
.. _engine presearch:
|
||||||
|
|
||||||
|
================
|
||||||
|
Presearch Engine
|
||||||
|
================
|
||||||
|
|
||||||
|
.. contents::
|
||||||
|
:depth: 2
|
||||||
|
:local:
|
||||||
|
:backlinks: entry
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.presearch
|
||||||
|
:members:
|
|
@ -1,23 +1,72 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
# lint: pylint
|
# lint: pylint
|
||||||
"""Presearch (general, images, videos, news)
|
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
|
||||||
|
images, videos, news).
|
||||||
|
|
||||||
|
Configured ``presearch`` engines:
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: presearch
|
||||||
|
engine: presearch
|
||||||
|
search_type: search
|
||||||
|
categories: [general, web]
|
||||||
|
|
||||||
|
- name: presearch images
|
||||||
|
...
|
||||||
|
search_type: images
|
||||||
|
categories: [images, web]
|
||||||
|
|
||||||
|
- name: presearch videos
|
||||||
|
...
|
||||||
|
search_type: videos
|
||||||
|
categories: [general, web]
|
||||||
|
|
||||||
|
- name: presearch news
|
||||||
|
...
|
||||||
|
search_type: news
|
||||||
|
categories: [news, web]
|
||||||
|
|
||||||
.. hint::
|
.. hint::
|
||||||
|
|
||||||
The results in the video category are most often links to pages that contain
|
By default Presearch's video category is intentionally placed into::
|
||||||
a video, for instance many links from preasearch's video category link
|
|
||||||
content from facebook (aka Meta) or Twitter (aka X). Since these are not
|
|
||||||
real links to video streams SearXNG can't use the video template for this and
|
|
||||||
if SearXNG can't use this template, then the user doesn't want to see these
|
|
||||||
hits in the videos category.
|
|
||||||
|
|
||||||
TL;DR; by default presearch's video category is placed into categories::
|
|
||||||
|
|
||||||
categories: [general, web]
|
categories: [general, web]
|
||||||
|
|
||||||
|
|
||||||
|
Search type ``video``
|
||||||
|
=====================
|
||||||
|
|
||||||
|
The results in the video category are most often links to pages that contain a
|
||||||
|
video, for instance many links from Presearch's video category link content
|
||||||
|
from facebook (aka Meta) or Twitter (aka X). Since these are not real links to
|
||||||
|
video streams SearXNG can't use the video template for this and if SearXNG can't
|
||||||
|
use this template, then the user doesn't want to see these hits in the videos
|
||||||
|
category.
|
||||||
|
|
||||||
|
|
||||||
|
Languages & Regions
|
||||||
|
===================
|
||||||
|
|
||||||
|
In Presearch there are languages for the UI and regions for narrowing down the
|
||||||
|
search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
|
||||||
|
``use_local_search_results=false``, then the defaults are set for both (the
|
||||||
|
language and the region) from the ``Accept-Language`` header.
|
||||||
|
|
||||||
|
Since the region is already "auto" by default, we only need to set the
|
||||||
|
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
|
||||||
|
have to set these values in both requests we send to Presearch; in the first
|
||||||
|
request to get the request-ID from Presearch and in the final request to get the
|
||||||
|
result list (see ``send_accept_language_header``).
|
||||||
|
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
from searx import locales
|
||||||
from searx.network import get
|
from searx.network import get
|
||||||
from searx.utils import gen_useragent, html_to_text
|
from searx.utils import gen_useragent, html_to_text
|
||||||
|
|
||||||
|
@ -32,6 +81,7 @@ about = {
|
||||||
paging = True
|
paging = True
|
||||||
safesearch = True
|
safesearch = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
|
send_accept_language_header = True
|
||||||
categories = ["general", "web"] # general, images, videos, news
|
categories = ["general", "web"] # general, images, videos, news
|
||||||
|
|
||||||
search_type = "search"
|
search_type = "search"
|
||||||
|
@ -46,19 +96,43 @@ def init(_):
|
||||||
raise ValueError(f'presearch search_type: {search_type}')
|
raise ValueError(f'presearch search_type: {search_type}')
|
||||||
|
|
||||||
|
|
||||||
def _get_request_id(query, page, time_range, safesearch_param):
|
def _get_request_id(query, params):
|
||||||
|
|
||||||
args = {
|
args = {
|
||||||
"q": query,
|
"q": query,
|
||||||
"page": page,
|
"page": params["pageno"],
|
||||||
}
|
}
|
||||||
if time_range:
|
|
||||||
args["time"] = time_range
|
if params["time_range"]:
|
||||||
|
args["time"] = params["time_range"]
|
||||||
|
|
||||||
url = f"{base_url}/{search_type}?{urlencode(args)}"
|
url = f"{base_url}/{search_type}?{urlencode(args)}"
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': gen_useragent(),
|
'User-Agent': gen_useragent(),
|
||||||
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
|
'Cookie': (
|
||||||
|
f"b=1;"
|
||||||
|
f" presearch_session=;"
|
||||||
|
f" use_local_search_results=false;"
|
||||||
|
f" use_safe_search={safesearch_map[params['safesearch']]}"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
if params['searxng_locale'] != 'all':
|
||||||
|
l = locales.get_locale(params['searxng_locale'])
|
||||||
|
|
||||||
|
# Presearch narrows down the search by region. In SearXNG when the user
|
||||||
|
# does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
|
||||||
|
# region.
|
||||||
|
|
||||||
|
# We could possibly use searx.locales.get_official_locales to determine
|
||||||
|
# in which regions this language is an official one, but then we still
|
||||||
|
# wouldn't know which region should be given more weight / Presearch
|
||||||
|
# performs an IP-based geolocation of the user, we don't want that in
|
||||||
|
# SearXNG ;-)
|
||||||
|
|
||||||
|
if l.territory:
|
||||||
|
headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
|
||||||
|
|
||||||
resp_text = get(url, headers=headers).text # type: ignore
|
resp_text = get(url, headers=headers).text # type: ignore
|
||||||
|
|
||||||
for line in resp_text.split("\n"):
|
for line in resp_text.split("\n"):
|
||||||
|
@ -69,8 +143,7 @@ def _get_request_id(query, page, time_range, safesearch_param):
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
request_id = _get_request_id(query, params)
|
||||||
|
|
||||||
params["headers"]["Accept"] = "application/json"
|
params["headers"]["Accept"] = "application/json"
|
||||||
params["url"] = f"{base_url}/results?id={request_id}"
|
params["url"] = f"{base_url}/results?id={request_id}"
|
||||||
|
|
||||||
|
@ -109,7 +182,17 @@ def parse_search_query(json_results):
|
||||||
if info:
|
if info:
|
||||||
attributes = []
|
attributes = []
|
||||||
for item in info.get('about', []):
|
for item in info.get('about', []):
|
||||||
label, value = html_to_text(item).split(':', 1)
|
|
||||||
|
text = html_to_text(item)
|
||||||
|
if ':' in text:
|
||||||
|
# split text into key / value
|
||||||
|
label, value = text.split(':', 1)
|
||||||
|
else:
|
||||||
|
# In other languages (tested with zh-TW) a colon is represented
|
||||||
|
# by a different symbol --> then we split at the first space.
|
||||||
|
label, value = text.split(' ', 1)
|
||||||
|
label = label[:-1]
|
||||||
|
|
||||||
value = _strip_leading_strings(value)
|
value = _strip_leading_strings(value)
|
||||||
attributes.append({'label': label, 'value': value})
|
attributes.append({'label': label, 'value': value})
|
||||||
content = []
|
content = []
|
||||||
|
|
Loading…
Reference in a new issue