Merge branch 'searxng:master' into master

This commit is contained in:
Azure Star 2023-03-29 09:58:10 +02:00 committed by GitHub
commit 42bc05744b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
75 changed files with 7823 additions and 6414 deletions

View file

@ -17,7 +17,7 @@ jobs:
- update_currencies.py - update_currencies.py
- update_external_bangs.py - update_external_bangs.py
- update_firefox_version.py - update_firefox_version.py
- update_languages.py - update_engine_traits.py
- update_wikidata_units.py - update_wikidata_units.py
- update_engine_descriptions.py - update_engine_descriptions.py
steps: steps:

View file

@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table
- Timeout - Timeout
- Weight - Weight
- Paging - Paging
- Language - Language, Region
- Safe search - Safe search
- Time range - Time range

View file

@ -569,10 +569,13 @@ engine is shown. Most of the options have a default value or even are optional.
To disable by default the engine, but not deleting it. It will allow the user To disable by default the engine, but not deleting it. It will allow the user
to manually activate it in the settings. to manually activate it in the settings.
``inactive``: optional
Remove the engine from the settings (*disabled & removed*).
``language`` : optional ``language`` : optional
If you want to use another language for a specific engine, you can define it If you want to use another language for a specific engine, you can define it
by using the full ISO code of language and country, like ``fr_FR``, ``en_US``, by using the ISO code of language (and region), like ``fr``, ``en-US``,
``de_DE``. ``de-DE``.
``tokens`` : optional ``tokens`` : optional
A list of secret tokens to make this engine *private*, more details see A list of secret tokens to make this engine *private*, more details see

View file

@ -127,6 +127,10 @@ extensions = [
'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page 'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page
] ]
autodoc_default_options = {
'member-order': 'groupwise',
}
myst_enable_extensions = [ myst_enable_extensions = [
"replacements", "smartquotes" "replacements", "smartquotes"
] ]
@ -135,6 +139,7 @@ suppress_warnings = ['myst.domains']
intersphinx_mapping = { intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None), "python": ("https://docs.python.org/3/", None),
"babel" : ("https://babel.readthedocs.io/en/latest/", None),
"flask": ("https://flask.palletsprojects.com/", None), "flask": ("https://flask.palletsprojects.com/", None),
"flask_babel": ("https://python-babel.github.io/flask-babel/", None), "flask_babel": ("https://python-babel.github.io/flask-babel/", None),
# "werkzeug": ("https://werkzeug.palletsprojects.com/", None), # "werkzeug": ("https://werkzeug.palletsprojects.com/", None),

View file

@ -54,6 +54,7 @@ Engine File
- ``offline`` :ref:`[ref] <offline engines>` - ``offline`` :ref:`[ref] <offline engines>`
- ``online_dictionary`` - ``online_dictionary``
- ``online_currency`` - ``online_currency``
- ``online_url_search``
======================= =========== ======================================================== ======================= =========== ========================================================
.. _engine settings: .. _engine settings:
@ -131,8 +132,10 @@ Passed Arguments (request)
These arguments can be used to construct the search query. Furthermore, These arguments can be used to construct the search query. Furthermore,
parameters with default value can be redefined for special purposes. parameters with default value can be redefined for special purposes.
.. _engine request online:
.. table:: If the ``engine_type`` is ``online`` .. table:: If the ``engine_type`` is :py:obj:`online
<searx.search.processors.online.OnlineProcessor.get_params>`
:width: 100% :width: 100%
====================== ============== ======================================================================== ====================== ============== ========================================================================
@ -149,12 +152,16 @@ parameters with default value can be redefined for special purposes.
safesearch int ``0``, between ``0`` and ``2`` (normal, moderate, strict) safesearch int ``0``, between ``0`` and ``2`` (normal, moderate, strict)
time_range Optional[str] ``None``, can be ``day``, ``week``, ``month``, ``year`` time_range Optional[str] ``None``, can be ``day``, ``week``, ``month``, ``year``
pageno int current pagenumber pageno int current pagenumber
language str specific language code like ``'en_US'``, or ``'all'`` if unspecified searxng_locale str SearXNG's locale selected by user. Specific language code like
``'en'``, ``'en-US'``, or ``'all'`` if unspecified.
====================== ============== ======================================================================== ====================== ============== ========================================================================
.. table:: If the ``engine_type`` is ``online_dictionary``, in addition to the .. _engine request online_dictionary:
``online`` arguments:
.. table:: If the ``engine_type`` is :py:obj:`online_dictionary
<searx.search.processors.online_dictionary.OnlineDictionaryProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100% :width: 100%
====================== ============== ======================================================================== ====================== ============== ========================================================================
@ -165,8 +172,11 @@ parameters with default value can be redefined for special purposes.
query str the text query without the languages query str the text query without the languages
====================== ============== ======================================================================== ====================== ============== ========================================================================
.. table:: If the ``engine_type`` is ``online_currency```, in addition to the .. _engine request online_currency:
``online`` arguments:
.. table:: If the ``engine_type`` is :py:obj:`online_currency
<searx.search.processors.online_currency.OnlineCurrencyProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100% :width: 100%
====================== ============== ======================================================================== ====================== ============== ========================================================================
@ -179,6 +189,26 @@ parameters with default value can be redefined for special purposes.
to_name str currency name to_name str currency name
====================== ============== ======================================================================== ====================== ============== ========================================================================
.. _engine request online_url_search:
.. table:: If the ``engine_type`` is :py:obj:`online_url_search
<searx.search.processors.online_url_search.OnlineUrlSearchProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100%
====================== ============== ========================================================================
argument type default-value, information
====================== ============== ========================================================================
search_url dict URLs from the search query:
.. code:: python
{
'http': str,
'ftp': str,
'data:image': str
}
====================== ============== ========================================================================
Specify Request Specify Request
--------------- ---------------

View file

@ -52,12 +52,12 @@ Scripts to update static data in :origin:`searx/data/`
:members: :members:
``update_languages.py`` ``update_engine_traits.py``
======================= ===========================
:origin:`[source] <searxng_extra/update/update_languages.py>` :origin:`[source] <searxng_extra/update/update_engine_traits.py>`
.. automodule:: searxng_extra.update.update_languages .. automodule:: searxng_extra.update.update_engine_traits
:members: :members:

View file

@ -0,0 +1,9 @@
.. _archlinux engine:
==========
Arch Linux
==========
.. automodule:: searx.engines.archlinux
:members:

View file

@ -0,0 +1,8 @@
.. _dailymotion engine:
===========
Dailymotion
===========
.. automodule:: searx.engines.dailymotion
:members:

View file

@ -0,0 +1,22 @@
.. _duckduckgo engines:
==================
DuckDuckGo engines
==================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.duckduckgo
:members:
.. automodule:: searx.engines.duckduckgo_images
:members:
.. automodule:: searx.engines.duckduckgo_definitions
:members:
.. automodule:: searx.engines.duckduckgo_weather
:members:

View file

@ -0,0 +1,17 @@
.. _searx.enginelib:
============
Engine model
============
.. automodule:: searx.enginelib
:members:
.. _searx.enginelib.traits:
=============
Engine traits
=============
.. automodule:: searx.enginelib.traits
:members:

View file

@ -0,0 +1,43 @@
.. _bing engines:
============
Bing Engines
============
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _bing web engine:
Bing WEB
========
.. automodule:: searx.engines.bing
:members:
.. _bing images engine:
Bing Images
===========
.. automodule:: searx.engines.bing_images
:members:
.. _bing videos engine:
Bing Videos
===========
.. automodule:: searx.engines.bing_videos
:members:
.. _bing news engine:
Bing News
=========
.. automodule:: searx.engines.bing_news
:members:

View file

@ -12,15 +12,21 @@ Google Engines
.. _google API: .. _google API:
google API Google API
========== ==========
.. _Query Parameter Definitions: .. _Query Parameter Definitions:
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
SearXNG's implementation of the Google API is mainly done in
:py:obj:`get_google_info <searx.engines.google.get_google_info>`.
For detailed description of the *REST-full* API see: `Query Parameter For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_. Not all parameters can be appied and some engines are *special* Definitions`_. The linked API documentation can sometimes be helpful during
(e.g. :ref:`google news engine`). reverse engineering. However, we cannot use it in the freely accessible WEB
services; not all parameters can be applied and some engines are more *special*
than others (e.g. :ref:`google news engine`).
.. _google web engine: .. _google web engine:
@ -30,6 +36,13 @@ Google WEB
.. automodule:: searx.engines.google .. automodule:: searx.engines.google
:members: :members:
.. _google autocomplete:
Google Autocomplete
====================
.. autofunction:: searx.autocomplete.google_complete
.. _google images engine: .. _google images engine:
Google Images Google Images
@ -53,3 +66,11 @@ Google News
.. automodule:: searx.engines.google_news .. automodule:: searx.engines.google_news
:members: :members:
.. _google scholar engine:
Google Scholar
==============
.. automodule:: searx.engines.google_scholar
:members:

View file

@ -0,0 +1,27 @@
.. _peertube engines:
================
Peertube Engines
================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _peertube video engine:
Peertube Video
==============
.. automodule:: searx.engines.peertube
:members:
.. _sepiasearch engine:
SepiaSearch
===========
.. automodule:: searx.engines.sepiasearch
:members:

View file

@ -1,8 +1,8 @@
.. _load_engines: .. _searx.engines:
============ =================
Load Engines SearXNG's engines
============ =================
.. automodule:: searx.engines .. automodule:: searx.engines
:members: :members:

View file

@ -0,0 +1,13 @@
.. _startpage engines:
=================
Startpage engines
=================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.startpage
:members:

View file

@ -0,0 +1,27 @@
.. _wikimedia engines:
=========
Wikimedia
=========
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _wikipedia engine:
Wikipedia
=========
.. automodule:: searx.engines.wikipedia
:members:
.. _wikidata engine:
Wikidata
=========
.. automodule:: searx.engines.wikidata
:members:

View file

@ -4,5 +4,17 @@
Locales Locales
======= =======
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.locales .. automodule:: searx.locales
:members: :members:
SearXNG's locale codes
======================
.. automodule:: searx.sxng_locales
:members:

View file

@ -0,0 +1,47 @@
.. _searx.search.processors:
=================
Search processors
=================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
Abstract processor class
========================
.. automodule:: searx.search.processors.abstract
:members:
Offline processor
=================
.. automodule:: searx.search.processors.offline
:members:
Online processor
================
.. automodule:: searx.search.processors.online
:members:
Online currency processor
=========================
.. automodule:: searx.search.processors.online_currency
:members:
Online Dictionary processor
===========================
.. automodule:: searx.search.processors.online_dictionary
:members:
Online URL search processor
===========================
.. automodule:: searx.search.processors.online_url_search
:members:

2
manage
View file

@ -63,7 +63,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
I,C,R,\ I,C,R,\
W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\ W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
E1136" E1136"
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories" PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="traits,supported_languages,language_aliases,logger,categories"
PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc" PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
help() { help() {

View file

@ -1,5 +1,5 @@
certifi==2022.12.7 certifi==2022.12.7
babel==2.11.0 babel==2.12.1
flask-babel==3.0.1 flask-babel==3.0.1
flask==2.2.3 flask==2.2.3
jinja2==3.1.2 jinja2==3.1.2

View file

@ -5,20 +5,20 @@
""" """
# pylint: disable=use-dict-literal # pylint: disable=use-dict-literal
from json import loads import json
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import etree import lxml
from httpx import HTTPError from httpx import HTTPError
from searx import settings from searx import settings
from searx.data import ENGINES_LANGUAGES from searx.engines import (
engines,
google,
)
from searx.network import get as http_get from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException from searx.exceptions import SearxEngineResponseException
# a fetch_supported_languages() for XPath engines isn't available right now
# _brave = ENGINES_LANGUAGES['brave'].keys()
def get(*args, **kwargs): def get(*args, **kwargs):
if 'timeout' not in kwargs: if 'timeout' not in kwargs:
@ -55,34 +55,58 @@ def dbpedia(query, _lang):
results = [] results = []
if response.ok: if response.ok:
dom = etree.fromstring(response.content) dom = lxml.etree.fromstring(response.content)
results = dom.xpath('//Result/Label//text()') results = dom.xpath('//Result/Label//text()')
return results return results
def duckduckgo(query, _lang): def duckduckgo(query, sxng_locale):
# duckduckgo autocompleter """Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
resp = loads(get(url.format(urlencode(dict(q=query)))).text) traits = engines['duckduckgo'].traits
if len(resp) > 1: args = {
return resp[1] 'q': query,
return [] 'kl': traits.get_region(sxng_locale, traits.all_locale),
}
url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
resp = get(url)
ret_val = []
if resp.ok:
j = resp.json()
if len(j) > 1:
ret_val = j[1]
return ret_val
def google(query, lang): def google_complete(query, sxng_locale):
# google autocompleter """Autocomplete from Google. Supports Google's languages and subdomains
autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&' (:py:obj:`searx.engines.google.get_google_info`) by using the async REST
API::
response = get(autocomplete_url + urlencode(dict(hl=lang, q=query))) https://{subdomain}/complete/search?{args}
"""
google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
url = 'https://{subdomain}/complete/search?{args}'
args = urlencode(
{
'q': query,
'client': 'gws-wiz',
'hl': google_info['params']['hl'],
}
)
results = [] results = []
resp = get(url.format(subdomain=google_info['subdomain'], args=args))
if response.ok: if resp.ok:
dom = etree.fromstring(response.text) json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
results = dom.xpath('//suggestion/@data') data = json.loads(json_txt)
for item in data[0]:
results.append(lxml.html.fromstring(item[0]).text_content())
return results return results
@ -109,9 +133,9 @@ def seznam(query, _lang):
] ]
def startpage(query, lang): def startpage(query, sxng_locale):
# startpage autocompleter """Autocomplete from Startpage. Supports Startpage's languages"""
lui = ENGINES_LANGUAGES['startpage'].get(lang, 'english') lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
url = 'https://startpage.com/suggestions?{query}' url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui}))) resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
data = resp.json() data = resp.json()
@ -122,20 +146,20 @@ def swisscows(query, _lang):
# swisscows autocompleter # swisscows autocompleter
url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5' url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
resp = loads(get(url.format(query=urlencode({'query': query}))).text) resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
return resp return resp
def qwant(query, lang): def qwant(query, sxng_locale):
# qwant autocompleter (additional parameter : lang=en_en&count=xxx ) """Autocomplete from Qwant. Supports Qwant's regions."""
url = 'https://api.qwant.com/api/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'lang': lang})))
results = [] results = []
locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
url = 'https://api.qwant.com/v3/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
if resp.ok: if resp.ok:
data = loads(resp.text) data = resp.json()
if data['status'] == 'success': if data['status'] == 'success':
for item in data['data']['items']: for item in data['data']['items']:
results.append(item['value']) results.append(item['value'])
@ -143,21 +167,38 @@ def qwant(query, lang):
return results return results
def wikipedia(query, lang): def wikipedia(query, sxng_locale):
# wikipedia autocompleter """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json' results = []
eng_traits = engines['wikipedia'].traits
wiki_lang = eng_traits.get_language(sxng_locale, 'en')
wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
resp = loads(get(url.format(urlencode(dict(search=query)))).text) url = 'https://{wiki_netloc}/w/api.php?{args}'
if len(resp) > 1: args = urlencode(
return resp[1] {
return [] 'action': 'opensearch',
'format': 'json',
'formatversion': '2',
'search': query,
'namespace': '0',
'limit': '10',
}
)
resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
if resp.ok:
data = resp.json()
if len(data) > 1:
results = data[1]
return results
def yandex(query, _lang): def yandex(query, _lang):
# yandex autocompleter # yandex autocompleter
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}" url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
resp = loads(get(url.format(urlencode(dict(part=query)))).text) resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
if len(resp) > 1: if len(resp) > 1:
return resp[1] return resp[1]
return [] return []
@ -166,7 +207,7 @@ def yandex(query, _lang):
backends = { backends = {
'dbpedia': dbpedia, 'dbpedia': dbpedia,
'duckduckgo': duckduckgo, 'duckduckgo': duckduckgo,
'google': google, 'google': google_complete,
'seznam': seznam, 'seznam': seznam,
'startpage': startpage, 'startpage': startpage,
'swisscows': swisscows, 'swisscows': swisscows,
@ -177,12 +218,11 @@ backends = {
} }
def search_autocomplete(backend_name, query, lang): def search_autocomplete(backend_name, query, sxng_locale):
backend = backends.get(backend_name) backend = backends.get(backend_name)
if backend is None: if backend is None:
return [] return []
try: try:
return backend(query, lang) return backend(query, sxng_locale)
except (HTTPError, SearxEngineResponseException): except (HTTPError, SearxEngineResponseException):
return [] return []

View file

@ -7,7 +7,7 @@
""" """
__all__ = [ __all__ = [
'ENGINES_LANGUAGES', 'ENGINE_TRAITS',
'CURRENCIES', 'CURRENCIES',
'USER_AGENTS', 'USER_AGENTS',
'EXTERNAL_URLS', 'EXTERNAL_URLS',
@ -42,7 +42,6 @@ def ahmia_blacklist_loader():
return f.read().split() return f.read().split()
ENGINES_LANGUAGES = _load('engines_languages.json')
CURRENCIES = _load('currencies.json') CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json') USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json') EXTERNAL_URLS = _load('external_urls.json')
@ -50,3 +49,4 @@ WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json') EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json') OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json') ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

136
searx/enginelib/__init__.py Normal file
View file

@ -0,0 +1,136 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Engine related implementations
.. note::
The long term goal is to modularize all relevant implementations to the
engines here in this Python package. In addition to improved modularization,
this will also be necessary in part because the probability of circular
imports will increase due to the increased typification of implementations in
the future.
ToDo:
- move :py:obj:`searx.engines.load_engine` to a new module `searx.enginelib`.
"""
from __future__ import annotations
from typing import Union, Dict, List, Callable, TYPE_CHECKING
if TYPE_CHECKING:
from searx.enginelib import traits
class Engine: # pylint: disable=too-few-public-methods
"""Class of engine instances built from YAML settings.
Further documentation see :ref:`general engine configuration`.
.. hint::
This class is currently never initialized and only used for type hinting.
"""
# Common options in the engine module
engine_type: str
"""Type of the engine (:origin:`searx/search/processors`)"""
paging: bool
"""Engine supports multiple pages."""
time_range_support: bool
"""Engine supports search time range."""
safesearch: bool
"""Engine supports SafeSearch"""
language_support: bool
"""Engine supports languages (locales) search."""
language: str
"""For an engine, when there is ``language: ...`` in the YAML settings the engine
does support only this one language:
.. code:: yaml
- name: google french
engine: google
language: fr
"""
region: str
"""For an engine, when there is ``region: ...`` in the YAML settings the engine
does support only this one region:
.. code:: yaml
- name: google belgium
engine: google
region: fr-BE
"""
fetch_traits: Callable
"""Function to fetch engine's traits from origin."""
traits: traits.EngineTraits
"""Traits of the engine."""
# settings.yml
categories: List[str]
"""Tabs, in which the engine is working."""
name: str
"""Name that will be used across SearXNG to define this engine. In settings, on
the result page .."""
engine: str
"""Name of the python file used to handle requests and responses to and from
this search engine (file name from :origin:`searx/engines` without
``.py``)."""
enable_http: bool
"""Enable HTTP (by default only HTTPS is enabled)."""
shortcut: str
"""Code used to execute bang requests (``!foo``)"""
timeout: float
"""Specific timeout for search-engine."""
display_error_messages: bool
"""Display error messages on the web UI."""
proxies: dict
"""Set proxies for a specific engine (YAML):
.. code:: yaml
proxies :
http: socks5://proxy:port
https: socks5://proxy:port
"""
disabled: bool
"""To disable by default the engine, but not deleting it. It will allow the
user to manually activate it in the settings."""
inactive: bool
"""Remove the engine from the settings (*disabled & removed*)."""
about: dict
"""Additional fields describing the engine.
.. code:: yaml
about:
website: https://example.com
wikidata_id: Q306656
official_api_documentation: https://example.com/api-doc
use_official_api: true
require_api_key: true
results: HTML
"""

250
searx/enginelib/traits.py Normal file
View file

@ -0,0 +1,250 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Engine's traits are fetched from the origin engines and stored in a JSON file
in the *data folder*. Most often traits are languages and region codes and
their mapping from SearXNG's representation to the representation in the origin
search engine. For new traits new properties can be added to the class
:py:class:`EngineTraits`.
To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be
used.
"""
from __future__ import annotations
import json
import dataclasses
from typing import Dict, Union, Callable, Optional, TYPE_CHECKING
from typing_extensions import Literal, Self
from searx import locales
from searx.data import data_dir, ENGINE_TRAITS
if TYPE_CHECKING:
from . import Engine
class EngineTraitsEncoder(json.JSONEncoder):
    """JSON encoder that is able to serialize :class:`EngineTraits` objects,
    see :class:`json.JSONEncoder`."""

    def default(self, o):
        """Return the attribute dictionary of an :class:`EngineTraits` object.

        Objects of any other type are handed over to the base class.
        """
        if not isinstance(o, EngineTraits):
            return super().default(o)
        return o.__dict__
@dataclasses.dataclass
class EngineTraits:
    """The class is intended to be instantiated for each engine."""

    regions: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps SearXNG's internal representation of a region to the one of the engine.

    SearXNG's internal representation can be parsed by babel and the value is
    sent to the engine:

    .. code:: python

       regions = {
           'fr-BE' : <engine's region name>,
       }

       for key, engine_region in regions.items():
          searxng_region = babel.Locale.parse(key, sep='-')
          ...
    """

    languages: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps SearXNG's internal representation of a language to the one of the engine.

    SearXNG's internal representation can be parsed by babel and the value is
    sent to the engine:

    .. code:: python

       languages = {
           'ca' : <engine's language name>,
       }

       for key, engine_lang in languages.items():
          searxng_lang = babel.Locale.parse(key)
          ...
    """

    all_locale: Optional[str] = None
    """To which locale value SearXNG's ``all`` language is mapped (shown as
    "Default language").
    """

    data_type: Literal['traits_v1'] = 'traits_v1'
    """Data type, default is 'traits_v1'.
    """

    custom: Dict[str, Dict] = dataclasses.field(default_factory=dict)
    """A place to store engine's custom traits, not related to the SearXNG core.
    """

    def get_language(self, searxng_locale: str, default=None):
        """Return engine's language string that *best fits* to SearXNG's locale.

        :param searxng_locale: SearXNG's internal representation of locale
          selected by the user.

        :param default: engine's default language

        The *best fits* rules are implemented in
        :py:obj:`locales.get_engine_locale`.  Except for the special value ``all``
        which is determined from :py:obj:`EngineTraits.all_locale` (when it is
        set).
        """
        if searxng_locale == 'all' and self.all_locale is not None:
            return self.all_locale
        return locales.get_engine_locale(searxng_locale, self.languages, default=default)

    def get_region(self, searxng_locale: str, default=None):
        """Return engine's region string that *best fits* to SearXNG's locale.

        :param searxng_locale: SearXNG's internal representation of locale
          selected by the user.

        :param default: engine's default region

        The *best fits* rules are implemented in
        :py:obj:`locales.get_engine_locale`.  Except for the special value ``all``
        which is determined from :py:obj:`EngineTraits.all_locale` (when it is
        set).
        """
        if searxng_locale == 'all' and self.all_locale is not None:
            return self.all_locale
        return locales.get_engine_locale(searxng_locale, self.regions, default=default)

    def is_locale_supported(self, searxng_locale: str) -> bool:
        """A *locale* (SearXNG's internal representation) is considered to be
        supported by the engine if the *region* or the *language* is supported
        by the engine.  For verification the functions
        :py:obj:`EngineTraits.get_region` and :py:obj:`EngineTraits.get_language`
        are used.

        :raises TypeError: if :py:obj:`EngineTraits.data_type` is unknown.
        """
        if self.data_type == 'traits_v1':
            return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale))

        raise TypeError('engine traits of type %s is unknown' % self.data_type)

    def copy(self):
        """Create a copy of the dataclass object.

        The field values are duplicated by :py:func:`dataclasses.asdict`, the
        copy does not share its dictionaries with ``self``.  The copy is an
        instance of the same class as ``self`` (subclasses are preserved).
        """
        return self.__class__(**dataclasses.asdict(self))

    @classmethod
    def fetch_traits(cls, engine: Engine) -> Union[Self, None]:
        """Call a function ``fetch_traits(engine_traits)`` from engines namespace to
        fetch and set properties from the origin engine in the object
        ``engine_traits``.  If the function does not exist, ``None`` is returned.
        """
        fetch_traits = getattr(engine, 'fetch_traits', None)
        engine_traits = None

        if fetch_traits:
            # the engine's function fills the (initially empty) traits object
            engine_traits = cls()
            fetch_traits(engine_traits)
        return engine_traits

    def set_traits(self, engine: Engine):
        """Set traits from self object in a :py:obj:`.Engine` namespace.

        :param engine: engine instance build by :py:func:`searx.engines.load_engine`
        :raises TypeError: if :py:obj:`EngineTraits.data_type` is unknown.
        :raises ValueError: if the ``language`` or ``region`` configured for the
          engine is not part of these traits.
        """
        if self.data_type == 'traits_v1':
            self._set_traits_v1(engine)
        else:
            raise TypeError('engine traits of type %s is unknown' % self.data_type)

    def _set_traits_v1(self, engine: Engine):
        # For an engine, when there is `language: ...` in the YAML settings the engine
        # does support only this one language (region)::
        #
        #   - name: google italian
        #     engine: google
        #     language: it
        #     region: it-IT

        # work on a copy, self (possibly shared by several engines) stays untouched
        traits = self.copy()

        _msg = "settings.yml - engine: '%s' / %s: '%s' not supported"

        languages = traits.languages
        if hasattr(engine, 'language'):
            if engine.language not in languages:
                raise ValueError(_msg % (engine.name, 'language', engine.language))
            traits.languages = {engine.language: languages[engine.language]}

        regions = traits.regions
        if hasattr(engine, 'region'):
            if engine.region not in regions:
                raise ValueError(_msg % (engine.name, 'region', engine.region))
            traits.regions = {engine.region: regions[engine.region]}

        engine.language_support = bool(traits.languages or traits.regions)

        # set the copied & modified traits in engine's namespace
        engine.traits = traits
class EngineTraitsMap(Dict[str, EngineTraits]):
    """A python dictionary that maps an engine name to its :class:`EngineTraits`."""

    ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
    """JSON file in which the :py:obj:`EngineTraitsMap` is persisted."""

    def save_data(self):
        """Write this map as JSON into the file :py:obj:`self.ENGINE_TRAITS_FILE`."""
        with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as traits_file:
            json.dump(self, traits_file, indent=2, sort_keys=True, cls=EngineTraitsEncoder)

    @classmethod
    def from_data(cls) -> Self:
        """Build a :class:`EngineTraitsMap` from the items of :py:obj:`ENGINE_TRAITS`."""
        traits_map = cls()
        for engine_name, fields in ENGINE_TRAITS.items():
            traits_map[engine_name] = EngineTraits(**fields)
        return traits_map

    @classmethod
    def fetch_traits(cls, log: Callable) -> Self:
        """Build a :class:`EngineTraitsMap` by calling
        :py:obj:`EngineTraits.fetch_traits` for each engine in
        ``searx.engines.engines`` (in sorted order of the engine names).

        :param log: callable that is called with one message string for the
          number of languages and one for the number of regions of each engine
          that returned traits.
        """
        from searx import engines  # pylint: disable=cyclic-import, import-outside-toplevel

        traits_map = cls()
        for engine_name in sorted(engines.engines):
            fetched = EngineTraits.fetch_traits(engines.engines[engine_name])
            if fetched is None:
                # engine has no fetch_traits function
                continue
            log("%-20s: SearXNG languages --> %s " % (engine_name, len(fetched.languages)))
            log("%-20s: SearXNG regions   --> %s" % (engine_name, len(fetched.regions)))
            traits_map[engine_name] = fetched
        return traits_map

    def set_traits(self, engine: Engine):
        """Set traits in a :py:obj:`Engine` namespace.

        :param engine: engine instance build by :py:func:`searx.engines.load_engine`
        """
        if engine.name in self:
            selected = self[engine.name]
        elif engine.engine in self:
            # The keys of this dictionary are the *engine names* configured in
            # settings.yml.  When multiple engines are configured in
            # settings.yml to use the same origin engine (python module), these
            # additional engines can use the traits of the origin engine.  For
            # this the configured ``engine: ...`` from settings.yml is used.
            selected = self[engine.engine]
        else:
            # no traits known for this engine: fall back to empty traits
            selected = EngineTraits(data_type='traits_v1')

        selected.set_traits(engine)

View file

@ -11,24 +11,22 @@ usage::
""" """
from __future__ import annotations
import sys import sys
import copy import copy
from typing import Dict, List, Optional
from os.path import realpath, dirname from os.path import realpath, dirname
from babel.localedata import locale_identifiers
from searx import logger, settings
from searx.data import ENGINES_LANGUAGES
from searx.network import get
from searx.utils import load_module, match_language, gen_useragent
from typing import TYPE_CHECKING, Dict, Optional
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines') logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__)) ENGINE_DIR = dirname(realpath(__file__))
BABEL_LANGS = [
lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())
]
ENGINE_DEFAULT_ARGS = { ENGINE_DEFAULT_ARGS = {
"engine_type": "online", "engine_type": "online",
"inactive": False, "inactive": False,
@ -36,8 +34,6 @@ ENGINE_DEFAULT_ARGS = {
"timeout": settings["outgoing"]["request_timeout"], "timeout": settings["outgoing"]["request_timeout"],
"shortcut": "-", "shortcut": "-",
"categories": ["general"], "categories": ["general"],
"supported_languages": [],
"language_aliases": {},
"paging": False, "paging": False,
"safesearch": False, "safesearch": False,
"time_range_support": False, "time_range_support": False,
@ -52,24 +48,6 @@ ENGINE_DEFAULT_ARGS = {
OTHER_CATEGORY = 'other' OTHER_CATEGORY = 'other'
class Engine:  # pylint: disable=too-few-public-methods
    """This class is currently never initialized and only used for type hinting.

    It lists the attributes other code in this module reads from or writes to
    an engine namespace.
    """

    # engine name as configured in the settings (used as lookup key in
    # set_language_attributes)
    name: str
    # value of ``engine: ...`` from the settings, i.e. the origin engine
    # (python module)
    engine: str
    # ENGINE_DEFAULT_ARGS provides "-" as default
    shortcut: str
    # ENGINE_DEFAULT_ARGS provides ["general"] as default
    categories: List[str]
    # NOTE(review): annotated as a list, but set_language_attributes also
    # handles the case where this is a dict -- confirm the intended type
    supported_languages: List[str]
    about: dict
    # ENGINE_DEFAULT_ARGS provides False as default
    inactive: bool
    disabled: bool
    # set by set_language_attributes: True when supported_languages is not empty
    language_support: bool
    # ENGINE_DEFAULT_ARGS provides False as default for the next three flags
    paging: bool
    safesearch: bool
    time_range_support: bool
    # ENGINE_DEFAULT_ARGS takes the default from
    # settings["outgoing"]["request_timeout"]
    timeout: float
# Defaults for the namespace of an engine module, see :py:func:`load_engine` # Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []} categories = {'general': []}
@ -136,9 +114,15 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
return None return None
update_engine_attributes(engine, engine_data) update_engine_attributes(engine, engine_data)
set_language_attributes(engine)
update_attributes_for_tor(engine) update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine): if not is_engine_active(engine):
return None return None
@ -190,60 +174,6 @@ def update_engine_attributes(engine: Engine, engine_data):
setattr(engine, arg_name, copy.deepcopy(arg_value)) setattr(engine, arg_name, copy.deepcopy(arg_value))
def set_language_attributes(engine: Engine):
    """Initialize the language related attributes of an engine namespace.

    Sets ``supported_languages`` (from ``ENGINES_LANGUAGES``, optionally
    narrowed to the ``language`` from the settings), fills
    ``language_aliases``, sets ``language_support`` and, when the engine
    provides ``_fetch_supported_languages``, installs
    ``fetch_supported_languages``.

    :raises ValueError: when the ``language`` from the settings is not one of
        the engine's supported languages
    """
    # The keys of ENGINES_LANGUAGES are the *engine names* configured in the
    # settings.  When several engines are configured to use the same origin
    # engine (python module), the additional engines take the languages of the
    # origin engine, looked up by the configured ``engine: ...`` value.
    lookup_key = engine.name if engine.name in ENGINES_LANGUAGES else engine.engine
    if lookup_key in ENGINES_LANGUAGES:
        engine.supported_languages = ENGINES_LANGUAGES[lookup_key]

    if hasattr(engine, 'language'):
        # A `language: ...` in the YAML settings means the engine supports
        # only this one language, which has to be a supported one.
        if engine.language not in engine.supported_languages:
            raise ValueError(
                "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
            )
        engine.supported_languages = (
            {engine.language: engine.supported_languages[engine.language]}
            if isinstance(engine.supported_languages, dict)
            else [engine.language]
        )

    # find custom aliases for non standard language codes
    for engine_lang in engine.supported_languages:
        iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
        if not iso_lang or iso_lang == engine_lang:
            continue
        if engine_lang.startswith(iso_lang) or iso_lang in engine.supported_languages:
            continue
        engine.language_aliases[iso_lang] = engine_lang

    engine.language_support = len(engine.supported_languages) > 0

    # the fetch method is only assigned when the auxiliary method exists
    if not hasattr(engine, '_fetch_supported_languages'):
        return

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
    }
    engine.fetch_supported_languages = (
        # pylint: disable=protected-access
        lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers))
    )
def update_attributes_for_tor(engine: Engine) -> bool: def update_attributes_for_tor(engine: Engine) -> bool:
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')

View file

@ -1,15 +1,32 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""" """
Arch Linux Wiki Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use a official API: Mediawiki provides API, but
Arch Wiki blocks access to it.
API: Mediawiki provides API, but Arch Wiki blocks access to it
""" """
from urllib.parse import urlencode, urljoin from typing import TYPE_CHECKING
from lxml import html from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx import network
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = { about = {
"website": 'https://wiki.archlinux.org/', "website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877', "wikidata_id": 'Q101445877',
@ -22,125 +39,113 @@ about = {
# engine dependent config # engine dependent config
categories = ['it', 'software wikis'] categories = ['it', 'software wikis']
paging = True paging = True
base_url = 'https://wiki.archlinux.org' main_wiki = 'wiki.archlinux.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
    """Return the part of ``locale`` before the first ``-`` (``en-US`` --> ``en``).

    A value without ``-`` is returned unchanged.
    """
    lang_code, _sep, _rest = locale.partition('-')
    return lang_code
# wikis for some languages were moved off from the main site, we need to make
# requests to correct URLs to be able to get results in those languages
# Maps a language code to the wiki that serves it.  Each entry has two keys:
# ``base`` (root URL of the wiki) and ``search`` (path of the localized search
# page, with ``{offset}`` and ``{query}`` placeholders that are filled in by
# ``request``).  ``all`` is the entry ``get_lang_urls`` falls back to.
lang_urls = {
    # fmt: off
    'all': {
        'base': 'https://wiki.archlinux.org',
        'search': '/index.php?title=Special:Search&offset={offset}&{query}'
    },
    'de': {
        'base': 'https://wiki.archlinux.de',
        'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
    },
    'fr': {
        'base': 'https://wiki.archlinux.fr',
        'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
    },
    'ja': {
        'base': 'https://wiki.archlinuxjp.org',
        'search': '/index.php?title=特別:検索&offset={offset}&{query}'
    },
    'ro': {
        'base': 'http://wiki.archlinux.ro',
        'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
    },
    'tr': {
        'base': 'http://archtr.org/wiki',
        'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
    }
    # fmt: on
}
# get base & search URLs for selected language
def get_lang_urls(language):
    """Return the ``lang_urls`` entry of ``language``, or the ``all`` entry
    when the language has no entry of its own."""
    key = language if language in lang_urls else 'all'
    return lang_urls[key]
# Language names to build search requests for
# those languages which are hosted on the main site.
# Maps a language code to the language name that ``request`` appends (in
# parentheses) to the query to narrow the results to that language.
main_langs = {
    'ar': 'العربية',
    'bg': 'Български',
    'cs': 'Česky',
    'da': 'Dansk',
    'el': 'Ελληνικά',
    'es': 'Español',
    'he': 'עברית',
    'hr': 'Hrvatski',
    'hu': 'Magyar',
    'it': 'Italiano',
    'ko': '한국어',
    'lt': 'Lietuviškai',
    'nl': 'Nederlands',
    'pl': 'Polski',
    'pt': 'Português',
    'ru': 'Русский',
    # NOTE(review): 'Slovenský' looks like the Slovak language name while 'sl'
    # is the ISO 639-1 code of Slovenian ('sk' is Slovak) -- confirm which
    # language is meant here
    'sl': 'Slovenský',
    'th': 'ไทย',
    'uk': 'Українська',
    'zh': '简体中文',
}
supported_languages = dict(lang_urls, **main_langs)
# do search-request
def request(query, params): def request(query, params):
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name sxng_lang = params['searxng_locale'].split('-')[0]
# to the query in order to narrow the results to that language netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
if language in main_langs: title = traits.custom['title'].get(sxng_lang, 'Special:Search')
query += ' (' + main_langs[language] + ')' base_url = 'https://' + netloc + '/index.php?'
# prepare the request parameters
query = urlencode({'search': query})
offset = (params['pageno'] - 1) * 20 offset = (params['pageno'] - 1) * 20
# get request URLs for our language of choice if netloc == main_wiki:
urls = get_lang_urls(language) eng_lang: str = traits.get_language(sxng_lang, 'English')
search_url = urls['base'] + urls['search'] query += ' (' + eng_lang + ')'
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
params['url'] = search_url.format(query=query, offset=offset) args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params return params
# get response from search-request
def response(resp): def response(resp):
# get the base URL for the language in which request was made
language = locale_to_lang_code(resp.search_params['language'])
base_url = get_lang_urls(language)['base']
results = [] results = []
dom = lxml.html.fromstring(resp.text)
dom = html.fromstring(resp.text) # get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
base_url = 'https://' + netloc + '/index.php?'
# parse results for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
for result in eval_xpath_list(dom, xpath_results): link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
link = eval_xpath_getindex(result, xpath_link, 0) content = extract_text(result.xpath('.//div[@class="searchresult"]'))
href = urljoin(base_url, link.attrib.get('href')) results.append(
title = extract_text(link) {
'url': urljoin(base_url, link.get('href')),
results.append({'url': href, 'title': title}) 'title': extract_text(link),
'content': content,
}
)
return results return results
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Archlinux-Wiki.  The location of the Wiki address of a
    language is mapped in a :py:obj:`custom field
    <searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``).  Depending
    on the location, the ``title`` argument in the request is translated.

    .. code:: python

       "custom": {
         "wiki_netloc": {
           "de": "wiki.archlinux.de",
            # ...
           "zh": "wiki.archlinuxcn.org"
         },
         "title": {
           "de": "Spezial:Suche",
            # ...
           "zh": "Special:\u641c\u7d22"
         },
       },

    The languages are read from the interlanguage links of the main wiki's
    start page.  Languages hosted outside of ``wiki.archlinux.org`` are only
    registered when the title of their search page is known (``title_map``).
    ``en`` is always registered.  When the start page can't be fetched, an
    error is printed and no links are evaluated.

    :param engine_traits: traits object that is filled in place
    """

    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['title'] = {}

    # title of the search page on the wikis that are not hosted on the main site
    title_map = {
        'de': 'Spezial:Suche',
        'fa': 'ویژه:جستجو',
        'ja': '特別:検索',
        'zh': 'Special:搜索',
    }

    resp = network.get('https://wiki.archlinux.org/')
    if resp.ok:
        dom = lxml.html.fromstring(resp.text)
        links = eval_xpath_list(dom, "//a[@class='interlanguage-link-target']")
    else:
        # Do not try to parse the body of an error response: it contains no
        # usable links and an empty body can't be parsed at all.
        print("ERROR: response from wiki.archlinux.org is not OK.")
        links = []

    for a in links:

        lang_attr = a.get('lang')
        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_attr, sep='-'))
        except babel.UnknownLocaleError:
            # skip a single unknown language instead of aborting the whole fetch
            print("ERROR: language (%s) is unknown by babel" % (lang_attr))
            continue
        # zh_Hans --> zh
        sxng_tag = sxng_tag.split('_')[0]

        netloc = urlparse(a.get('href')).netloc
        if netloc != 'wiki.archlinux.org':
            title = title_map.get(sxng_tag)
            if not title:
                print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
                continue
            engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
            engine_traits.custom['title'][sxng_tag] = title

        eng_tag = extract_text(eval_xpath_list(a, ".//span"))
        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['en'] = 'English'

View file

@ -1,16 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Bing (Web) """This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages and regions (see section
'Search results languages' and 'Country/region'). However, the abundant choice
does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. By example: you can select a language like Māori
but you never get a result in this language.
What comes a bit closer to the truth are the `search-APIs`_ but they don't seem
to be completely correct either (if you take a closer look you will find some
inaccuracies there too):
- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
""" """
# pylint: disable=too-many-branches # pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import datetime
import re import re
from urllib.parse import urlencode, urlparse, parse_qs import uuid
from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex import babel
from searx.network import multi_requests, Request import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx import network
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = { about = {
"website": 'https://www.bing.com', "website": 'https://www.bing.com',
@ -21,56 +58,124 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""
# engine dependent config # engine dependent config
categories = ['general', 'web'] categories = ['general', 'web']
paging = True paging = True
time_range_support = False time_range_support = True
safesearch = False safesearch = True
send_accept_language_header = True safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {}
# search-url base_url = 'https://www.bing.com/search'
base_url = 'https://www.bing.com/' """Bing (Web) search URL"""
# initial query: https://www.bing.com/search?q=foo&search=&form=QBLH bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
inital_query = 'search?{query}&search=&form=QBLH' """Bing (Web) search API description"""
# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'
def _get_offset_from_pageno(pageno): def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1 return (pageno - 1) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region, SID):
    """Set the Bing cookies ``_EDGE_V``, ``_EDGE_S``, ``_EDGE_CD`` and
    ``SRCHHPGUSR`` in ``params['cookies']``.

    :param params: request parameters; ``params['cookies']`` is modified in
        place and ``params['safesearch']`` selects the ``ADLT`` value
    :param engine_language: language code; used lower-cased as UI language
        and unchanged as ``SRCHLANG``
    :param engine_region: region (market) code; used lower-cased
    :param SID: session ID placed in the ``_EDGE_S`` cookie
    """
    cookies = params['cookies']
    cookies['_EDGE_V'] = '1'

    market = engine_region.lower()  # search region, e.g. zh-cn
    ui_lang = engine_language.lower()  # UI language, e.g. en-us

    # _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
    cookies['_EDGE_S'] = '&'.join(('F=1', f'SID={SID}', f'mkt={market}', f'ui={ui_lang}'))
    logger.debug("cookie _EDGE_S=%s", cookies['_EDGE_S'])

    # "_EDGE_CD": "m=zh-tw",
    cookies['_EDGE_CD'] = '&'.join((f'm={market}', f'u={ui_lang}')) + ';'
    logger.debug("cookie _EDGE_CD=%s", cookies['_EDGE_CD'])

    # Trying to set ADLT cookie here seems not to have any effect, I assume
    # there is some age verification by a cookie (and/or session ID) needed,
    # to disable the SafeSearch.
    adult_filter = safesearch_types.get(params['safesearch'], 'DEMOTE')
    cookies['SRCHHPGUSR'] = '&'.join((f'SRCHLANG={engine_language}', f'ADLT={adult_filter}'))
    logger.debug("cookie SRCHHPGUSR=%s", cookies['SRCHHPGUSR'])
def request(query, params): def request(query, params):
"""Assemble a Bing-Web request."""
offset = _get_offset_from_pageno(params.get('pageno', 1)) engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# logger.debug("params['pageno'] --> %s", params.get('pageno')) SID = uuid.uuid1().hex.upper()
# logger.debug(" offset --> %s", offset) CVID = uuid.uuid1().hex.upper()
search_string = page_query set_bing_cookies(params, engine_language, engine_region, SID)
if offset == 1:
search_string = inital_query
if params['language'] == 'all': # build URL query
lang = 'EN' # ---------------
else:
lang = match_language(params['language'], supported_languages, language_aliases)
query = 'language:{} {}'.format(lang.split('-')[0].upper(), query) # query term
page = int(params.get('pageno', 1))
query_params = {
# fmt: off
'q': query,
'pq': query,
'cvid': CVID,
'qs': 'n',
'sp': '-1'
# fmt: on
}
search_path = search_string.format(query=urlencode({'q': query}), offset=offset) # page
if page > 1:
if offset > 1: referer = base_url + '?' + urlencode(query_params)
referer = base_url + inital_query.format(query=urlencode({'q': query}))
params['headers']['Referer'] = referer params['headers']['Referer'] = referer
logger.debug("headers.Referer --> %s", referer) logger.debug("headers.Referer --> %s", referer)
params['url'] = base_url + search_path query_params['first'] = _get_offset_from_pageno(page)
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
filters = ''
if params['time_range']:
query_params['filt'] = 'custom'
if params['time_range'] == 'day':
filters = 'ex1:"ez1"'
elif params['time_range'] == 'week':
filters = 'ex1:"ez2"'
elif params['time_range'] == 'month':
filters = 'ex1:"ez3"'
elif params['time_range'] == 'year':
epoch_1970 = datetime.date(1970, 1, 1)
today_no = (datetime.date.today() - epoch_1970).days
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
params['url'] = base_url + '?' + urlencode(query_params)
if filters:
params['url'] = params['url'] + '&filters=' + filters
return params return params
@ -107,7 +212,8 @@ def response(resp):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
# Bing can shorten the URL either at the end or in the middle of the string # Bing can shorten the URL either at the end or in the middle of the string
if ( if (
url_cite.startswith('https://') url_cite
and url_cite.startswith('https://')
and '' not in url_cite and '' not in url_cite
and '...' not in url_cite and '...' not in url_cite
and '' not in url_cite and '' not in url_cite
@ -127,9 +233,9 @@ def response(resp):
# resolve all Bing redirections in parallel # resolve all Bing redirections in parallel
request_list = [ request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
] ]
response_list = multi_requests(request_list) response_list = network.multi_requests(request_list)
for i, redirect_response in enumerate(response_list): for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception): if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
@ -157,27 +263,71 @@ def response(resp):
return results return results
# get supported languages from their site def fetch_traits(engine_traits: EngineTraits):
def _fetch_supported_languages(resp): """Fetch languages and regions from Bing-Web."""
lang_tags = set() xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
# insert alias to map from a language (zh) to a language + script (zh_Hans)
engine_traits.languages['zh'] = 'zh-hans'
resp = network.get(url)
if not resp.ok:
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')
for _li in lang_links: map_lang = {'jp': 'ja'}
for td in eval_xpath(dom, xpath_language_codes):
eng_lang = td.text
href = eval_xpath(_li, './/@href')[0] if eng_lang in ('en-gb', 'pt-br'):
(_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href) # language 'en' is already in the list and a language 'en-gb' can't
query = parse_qs(query, keep_blank_values=True) # be handled in SearXNG, same with pt-br which is covered by pt-pt.
continue
# fmt: off babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
setlang = query.get('setlang', [None, ])[0] try:
# example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN'] sxng_tag = language_tag(babel.Locale.parse(babel_lang))
lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip except babel.UnknownLocaleError:
# fmt: on print("ERROR: language (%s) is unknown by babel" % (eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
tag = lang + '-' + nation if nation else lang map_region = {
lang_tags.add(tag) 'en-ID': 'id_ID',
'no-NO': 'nb_NO',
}
return list(lang_tags) for td in eval_xpath(dom, xpath_market_codes):
eng_region = td.text
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
if eng_region == 'en-WW':
engine_traits.all_locale = eng_region
continue
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region))
except babel.UnknownLocaleError:
print("ERROR: region (%s) is unknown by babel" % (eng_region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_region:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
continue
engine_traits.regions[sxng_tag] = eng_region

View file

@ -1,20 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Bing (Images) """Bing-Images: description see :py:obj:`searx.engines.bing`.
""" """
# pylint: disable=invalid-name
from json import loads
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import match_language from searx.enginelib.traits import EngineTraits
from searx.engines.bing import language_aliases from searx.engines.bing import (
from searx.engines.bing import ( # pylint: disable=unused-import set_bing_cookies,
_fetch_supported_languages, _fetch_traits,
supported_languages_url,
) )
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -31,77 +41,92 @@ categories = ['images', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28
# search-url base_url = 'https://www.bing.com/images/async'
base_url = 'https://www.bing.com/' """Bing (Images) search URL"""
search_string = (
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
"""Bing (Images) search API description"""
time_map = {
# fmt: off # fmt: off
'images/search' 'day': 60 * 24,
'?{query}' 'week': 60 * 24 * 7,
'&count={count}' 'month': 60 * 24 * 31,
'&first={first}' 'year': 60 * 24 * 365,
'&tsc=ImageHoverTitle'
# fmt: on # fmt: on
) }
time_range_string = '&qft=+filterui:age-lt{interval}'
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
# safesearch definitions
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
# do search-request
def request(query, params): def request(query, params):
offset = ((params['pageno'] - 1) * number_of_results) + 1 """Assemble a Bing-Image request."""
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
language = match_language(params['language'], supported_languages, language_aliases).lower() SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') # build URL query
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1' query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count lets use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
params['url'] = base_url + search_path # time range
if params['time_range'] in time_range_dict: # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params return params
# get response from search-request
def response(resp): def response(resp):
results = [] """Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
for result in dom.xpath('//div[@class="imgpt"]'):
img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
# Microsoft seems to experiment with this code so don't make the path too specific,
# just catch the text section for the first anchor in img_info assuming this to be
# the originating site.
source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
m = loads(result.xpath('./a/@m')[0]) metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
# strip 'Unicode private use area' highlighting, they render to Tux metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
# the Linux penguin and a standing diamond on my machine... title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
title = m.get('t', '').replace('\ue000', '').replace('\ue001', '') img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip()
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append( results.append(
{ {
'template': 'images.html', 'template': 'images.html',
'url': m['purl'], 'url': metadata['purl'],
'thumbnail_src': m['turl'], 'thumbnail_src': metadata['turl'],
'img_src': m['murl'], 'img_src': metadata['murl'],
'content': '', 'content': metadata['desc'],
'title': title, 'title': title,
'source': source, 'source': source,
'img_format': img_format, 'img_format': img_format,
} }
) )
return results return results
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Images.

    Delegates to ``_fetch_traits`` with ``bing_traits_url`` of this module and
    the XPath expressions of the market and language code columns.
    """
    # The country codes ('//table[2]/tbody/tr/td[2]') are not evaluated.
    _fetch_traits(
        engine_traits,
        bing_traits_url,
        xpath_language_codes='//table[3]/tbody/tr/td[2]',
        xpath_market_codes='//table[1]/tbody/tr/td[3]',
    )

View file

@ -1,24 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Bing (News) """Bing-News: description see :py:obj:`searx.engines.bing`.
""" """
from urllib.parse import ( # pylint: disable=invalid-name
urlencode,
urlparse, from typing import TYPE_CHECKING
parse_qsl, import uuid
quote, from urllib.parse import urlencode
)
from datetime import datetime from lxml import html
from dateutil import parser
from lxml import etree from searx.enginelib.traits import EngineTraits
from lxml.etree import XPath from searx.engines.bing import (
from searx.utils import match_language, eval_xpath_getindex set_bing_cookies,
from searx.engines.bing import ( # pylint: disable=unused-import _fetch_traits,
language_aliases,
_fetch_supported_languages,
supported_languages_url,
) )
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -34,108 +40,111 @@ about = {
categories = ['news'] categories = ['news']
paging = True paging = True
time_range_support = True time_range_support = True
send_accept_language_header = True time_map = {
'day': '4',
'week': '8',
'month': '9',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference between *last day* and *last week* in the result list is only marginal.
"""
# search-url base_url = 'https://www.bing.com/news/infinitescrollajax'
base_url = 'https://www.bing.com/' """Bing (News) search URL"""
search_string = 'news/search?{query}&first={offset}&format=RSS'
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
"""Bing (News) search API description"""
def url_cleanup(url_string): mkt_alias = {
"""remove click""" 'zh': 'en-WW',
'zh-CN': 'en-WW',
parsed_url = urlparse(url_string) }
if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx': """Bing News has an official market code 'zh-CN' but we won't get a result with
query = dict(parse_qsl(parsed_url.query)) this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
url_string = query.get('url', None) market code (en-WW).
return url_string """
def image_url_cleanup(url_string):
"""replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=..."""
parsed_url = urlparse(url_string)
if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
query = dict(parse_qsl(parsed_url.query))
url_string = "https://www.bing.com/th?id=" + quote(query.get('id'))
return url_string
def _get_url(query, language, offset, time_range):
if time_range in time_range_dict:
search_path = search_string_with_time.format(
# fmt: off
query = urlencode({
'q': query,
'setmkt': language
}),
offset = offset,
interval = time_range_dict[time_range]
# fmt: on
)
else:
# e.g. setmkt=de-de&setlang=de
search_path = search_string.format(
# fmt: off
query = urlencode({
'q': query,
'setmkt': language
}),
offset = offset
# fmt: on
)
return base_url + search_path
def request(query, params): def request(query, params):
"""Assemble a Bing-News request."""
if params['time_range'] and params['time_range'] not in time_range_dict: sxng_locale = params['searxng_locale']
return params engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
engine_language = traits.get_language(sxng_locale, 'en')
offset = (params['pageno'] - 1) * 10 + 1 SID = uuid.uuid1().hex.upper()
if params['language'] == 'all': set_bing_cookies(params, engine_language, engine_region, SID)
language = 'en-US'
else: # build URL query
language = match_language(params['language'], supported_languages, language_aliases) #
params['url'] = _get_url(query, language, offset, params['time_range']) # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
query_params = {
# fmt: off
'q': query,
'InfiniteScroll': 1,
# to simplify the page count, let's use the default of 10 news items per page
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
# fmt: on
}
if params['time_range']:
# qft=interval:"7"
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
params['url'] = base_url + '?' + urlencode(query_params)
return params return params
def response(resp): def response(resp):
"""Get response from Bing-News"""
results = [] results = []
rss = etree.fromstring(resp.content)
namespaces = rss.nsmap
for item in rss.xpath('./channel/item'): if not resp.ok or not resp.text:
# url / title / content return results
url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None))
title = eval_xpath_getindex(item, './title/text()', 0, default=url)
content = eval_xpath_getindex(item, './description/text()', 0, default='')
# publishedDate dom = html.fromstring(resp.text)
publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None)
try:
publishedDate = parser.parse(publishedDate, dayfirst=False)
except TypeError:
publishedDate = datetime.now()
except ValueError:
publishedDate = datetime.now()
# thumbnail for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None)
if thumbnail is not None:
thumbnail = image_url_cleanup(thumbnail)
# append result url = newsitem.xpath('./@url')[0]
if thumbnail is not None: title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
results.append( content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
{'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content, 'img_src': thumbnail} thumbnail = None
) author = newsitem.xpath('./@data-author')[0]
else: metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
if img_src:
thumbnail = 'https://www.bing.com/' + img_src[0]
results.append(
{
'url': url,
'title': title,
'content': content,
'img_src': thumbnail,
'author': author,
'metadata': metadata,
}
)
return results return results
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-News.

    The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
    first table says *"query parameter when calling the Video Search API."*
    That is why the market codes are taken from the fourth table ("News
    Category API markets") instead of the first one.
    """
    # The country codes ('//table[2]/tbody/tr/td[2]') are not used.
    language_codes_xpath = '//table[3]/tbody/tr/td[2]'
    market_codes_xpath = '//table[4]/tbody/tr/td[3]'
    _fetch_traits(engine_traits, bing_traits_url, language_codes_xpath, market_codes_xpath)

View file

@ -1,21 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Bing (Videos) """Bing-Videos: description see :py:obj:`searx.engines.bing`.
""" """
# pylint: disable=invalid-name
from json import loads from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import match_language from searx.enginelib.traits import EngineTraits
from searx.engines.bing import language_aliases from searx.engines.bing import (
set_bing_cookies,
from searx.engines.bing import ( # pylint: disable=unused-import _fetch_traits,
_fetch_supported_languages,
supported_languages_url,
) )
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = { about = {
"website": 'https://www.bing.com/videos', "website": 'https://www.bing.com/videos',
@ -26,65 +35,76 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
# engine dependent config
categories = ['videos', 'web'] categories = ['videos', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
number_of_results = 28
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/videos/asyncv2'
search_string = ( """Bing (Videos) async search URL."""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
"""Bing (Video) search API description"""
time_map = {
# fmt: off # fmt: off
'videos/search' 'day': 60 * 24,
'?{query}' 'week': 60 * 24 * 7,
'&count={count}' 'month': 60 * 24 * 31,
'&first={first}' 'year': 60 * 24 * 365,
'&scope=video'
'&FORM=QBLH'
# fmt: on # fmt: on
) }
time_range_string = '&qft=+filterui:videoage-lt{interval}'
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
# safesearch definitions
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
# do search-request
def request(query, params): def request(query, params):
offset = ((params['pageno'] - 1) * number_of_results) + 1 """Assemble a Bing-Video request."""
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset) engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# safesearch cookie SID = uuid.uuid1().hex.upper()
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') set_bing_cookies(params, engine_language, engine_region, SID)
# language cookie # build URL query
language = match_language(params['language'], supported_languages, language_aliases).lower() #
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1' # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
# query and paging query_params = {
params['url'] = base_url + search_path # fmt: off
'q': query,
'async' : 'content',
# to simplify the page count, let's use the default of 35 videos per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range # time range
if params['time_range'] in time_range_dict: #
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params return params
# get response from search-request
def response(resp): def response(resp):
"""Get response from Bing-Video"""
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'): for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info) content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid']) thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append( results.append(
{ {
'url': metadata['murl'], 'url': metadata['murl'],
@ -96,3 +116,13 @@ def response(resp):
) )
return results return results
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-Videos.

    The work is delegated to :py:obj:`_fetch_traits`, which is given the URL in
    ``bing_traits_url`` and two XPath expressions: the language codes are
    selected from the second column of the third table, the market codes from
    the third column of the first table.
    """
    # The country codes ('//table[2]/tbody/tr/td[2]') are not used.
    language_codes_xpath = '//table[3]/tbody/tr/td[2]'
    market_codes_xpath = '//table[1]/tbody/tr/td[3]'
    _fetch_traits(engine_traits, bing_traits_url, language_codes_xpath, market_codes_xpath)

View file

@ -1,17 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
"""Dailymotion (Videos) # lint: pylint
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
""" """
from typing import Set from typing import TYPE_CHECKING
from datetime import datetime, timedelta from datetime import datetime, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
import time import time
import babel import babel
from searx.exceptions import SearxEngineAPIException from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror from searx import network
from searx.utils import html_to_text from searx.utils import html_to_text
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -37,11 +55,24 @@ time_delta_dict = {
} }
safesearch = True safesearch = True
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''} safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
# search-url family_filter_map = {
# - https://developers.dailymotion.com/tools/ 2: 'true',
# - https://www.dailymotion.com/doc/api/obj-video.html 1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [ result_fields = [
'allow_embed', 'allow_embed',
@ -53,27 +84,21 @@ result_fields = [
'thumbnail_360_url', 'thumbnail_360_url',
'id', 'id',
] ]
search_url = ( """`Fields selection`_, by default, a few fields are returned. To request more
'https://api.dailymotion.com/videos?' specific fields, the ``fields`` parameter is used with the list of fields
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}' SearXNG needs in the response to build a video result list.
).format( """
fields=','.join(result_fields),
password_protected='false', search_url = 'https://api.dailymotion.com/videos?'
private='false', """URL to retrieve a list of videos.
sort='relevance',
limit=number_of_results, - `REST GET`_
) - `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}" iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
"""URL template to embed video in SearXNG's result list."""
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
supported_languages_iso639: Set[str] = set()
def init(_engine_settings):
global supported_languages_iso639
supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages])
def request(query, params): def request(query, params):
@ -81,34 +106,42 @@ def request(query, params):
if not query: if not query:
return False return False
language = params['language'] eng_region = traits.get_region(params['searxng_locale'], 'en_US')
if language == 'all': eng_lang = traits.get_language(params['searxng_locale'], 'en')
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
language_iso639 = locale.language args = {
if locale.language not in supported_languages_iso639:
language_iso639 = 'en'
query_args = {
'search': query, 'search': query,
'languages': language_iso639, 'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'], 'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
} }
if locale.territory: args.update(safesearch_params.get(params['safesearch'], {}))
localization = locale.language + '_' + locale.territory
if localization in supported_languages: # Don't add localization and country arguments if the user does select a
query_args['country'] = locale.territory # language (:de, :en, ..)
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"]) time_delta = time_delta_dict.get(params["time_range"])
if time_delta: if time_delta:
created_after = datetime.now() - time_delta created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after) args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args) query_str = urlencode(args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '') params['url'] = search_url + query_str
params['raise_for_httperror'] = False
return params return params
@ -123,7 +156,7 @@ def response(resp):
if 'error' in search_res: if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message')) raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp) network.raise_for_httperror(resp)
# parse results # parse results
for res in search_res.get('list', []): for res in search_res.get('list', []):
@ -167,7 +200,53 @@ def response(resp):
return results return results
# get supported languages from their site def fetch_traits(engine_traits: EngineTraits):
def _fetch_supported_languages(resp): """Fetch locales & languages from dailymotion.
response_json = resp.json()
return [item['locale'] for item in response_json['list']] Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *languages* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = network.get('https://api.dailymotion.com/locales')
if not resp.ok:
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']:
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = network.get('https://api.dailymotion.com/languages')
if not resp.ok:
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']:
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag

View file

@ -63,7 +63,7 @@ def search(query, request_params):
for row in result_list: for row in result_list:
entry = { entry = {
'query': query, 'query': query,
'language': request_params['language'], 'language': request_params['searxng_locale'],
'value': row.get("value"), 'value': row.get("value"),
# choose a result template or comment out to use the *default* # choose a result template or comment out to use the *default*
'template': 'key-value.html', 'template': 'key-value.html',

View file

@ -1,71 +1,207 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""DuckDuckGo Lite """
DuckDuckGo Lite
~~~~~~~~~~~~~~~
""" """
from json import loads from typing import TYPE_CHECKING
from urllib.parse import urlencode
from lxml.html import fromstring import json
import babel
import lxml.html
from searx import (
network,
locales,
redislib,
)
from searx import redisdb
from searx.utils import ( from searx.utils import (
dict_subset,
eval_xpath, eval_xpath,
eval_xpath_getindex, eval_xpath_getindex,
extract_text, extract_text,
match_language,
) )
from searx.network import get from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = { about = {
"website": 'https://lite.duckduckgo.com/lite/', "website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805', "wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False, "use_official_api": False,
"require_api_key": False, "require_api_key": False,
"results": 'HTML', "results": 'HTML',
} }
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess the user's preferred language from the HTTP
``Accept-Language`` header. Optionally, the user can select a region filter (but not a
language).
"""
# engine dependent config # engine dependent config
categories = ['general', 'web'] categories = ['general', 'web']
paging = True paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True time_range_support = True
send_accept_language_header = True safesearch = True # user can't select but the results are filtered
language_aliases = { url = 'https://lite.duckduckgo.com/lite/'
'ar-SA': 'ar-XA', # url_ping = 'https://duckduckgo.com/t/sl_l'
'es-419': 'es-XL',
'ja': 'jp-JP',
'ko': 'kr-KR',
'sl-SI': 'sl-SL',
'zh-TW': 'tzh-TW',
'zh-HK': 'tzh-HK',
}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
# search-url
url = 'https://lite.duckduckgo.com/lite/'
url_ping = 'https://duckduckgo.com/t/sl_l'
# match query's language to a region code that duckduckgo will accept def cache_vqd(query, value):
def get_region_code(lang, lang_list=None): """Caches a ``vqd`` value from a query.
if lang == 'all':
return None
lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT') The vqd value depends on the query string and is needed for the follow up
lang_parts = lang_code.split('-') pages or the images loaded by a XMLHttpRequest:
# country code goes first - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
return lang_parts[1].lower() + '-' + lang_parts[0].lower() - DuckDuckGo Images: `https://duckduckgo.com/i.js?q=...&vqd=...`
"""
c = redisdb.client()
if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600)
def get_vqd(query, headers):
    """Return the ``vqd`` value that belongs to *query*.

    A value stored by :py:obj:`cache_vqd` is re-used when a Redis client is
    available and holds one.  Otherwise the query is sent to DuckDuckGo, the
    ``vqd`` is cut out of the response text and then cached.

    :param query: search term the ``vqd`` is bound to
    :param headers: HTTP headers sent with the request to DuckDuckGo
    :raises SearxEngineAPIException: if the response text contains no ``vqd='``
    """
    redis_client = redisdb.client()
    if redis_client:
        cached = redis_client.get('SearXNG_ddg_vqd' + redislib.secret_hash(query))
        if cached:
            # Redis hands the value back as bytes
            cached = cached.decode('utf-8')
            logger.debug("re-use cached vqd value: %s", cached)
            return cached

    # nothing cached: ask DuckDuckGo and extract the value from the page
    query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query}))
    content = network.get(query_url, headers=headers).text

    marker = 'vqd=\''
    start = content.find(marker)
    if start == -1:
        raise SearxEngineAPIException('Request failed')

    # the value is everything between the marker and the next single quote
    tail = content[start + len(marker) :]
    vqd = tail[: tail.find('\'')]
    logger.debug("new vqd value: %s", vqd)
    cache_vqd(query, vqd)
    return vqd
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
    """Get DuckDuckGo's language identifier from SearXNG's locale.

    DuckDuckGo defines its languages by region codes (see
    :py:obj:`fetch_traits`).  The lookup first consults the ``lang_region``
    table in ``eng_traits.custom``; if *sxng_locale* is not listed there, the
    result of ``eng_traits.get_language(sxng_locale, default)`` is returned.

    To get region and language of a DDG service use:

    .. code:: python

       eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
       eng_lang = get_ddg_lang(traits, params['searxng_locale'])

    It might be confusing, but the ``l`` value of the cookie is what SearXNG
    calls the *region*:

    .. code:: python

        # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
        params['cookies']['ad'] = eng_lang
        params['cookies']['ah'] = eng_region
        params['cookies']['l'] = eng_region

    .. hint::

       `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
       selection to the user, only a region can be selected by the user
       (``eng_region`` from the example above).  DDG-lite stores the selected
       region in a cookie::

         params['cookies']['kl'] = eng_region  # 'ar-es'

    """
    lang_region_lookup = eng_traits.custom['lang_region'].get
    # the fallback is always computed, even if the locale is in the table
    fallback = eng_traits.get_language(sxng_locale, default)
    return lang_region_lookup(sxng_locale, fallback)
# Explicit mapping from DuckDuckGo region codes to the locale string that
# fetch_traits hands to babel.  Regions not listed here are converted in
# fetch_traits from '<territory>-<lang>' to '<lang>_<TERRITORY>'.  The value
# 'skip' makes fetch_traits ignore the region.
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
# Special handling of DuckDuckGo language codes in fetch_traits:
#
# - 'lang_region': the code is stored by its region tag in
#   engine_traits.custom['lang_region'] (read by get_ddg_lang) and is not
#   added to engine_traits.languages,
# - 'skip': the code is ignored,
# - any other value: the string parsed by babel in place of the DuckDuckGo code.
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
# use fr --> fr_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def request(query, params): def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
params['url'] = url params['url'] = url
params['method'] = 'POST' params['method'] = 'POST'
params['data']['q'] = query params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate # The API is not documented, so we do some reverse engineering and emulate
@ -88,23 +224,19 @@ def request(query, params):
params['data']['s'] = offset params['data']['s'] = offset
params['data']['dc'] = offset + 1 params['data']['dc'] = offset + 1
# request needs a vqd argument
params['data']['vqd'] = get_vqd(query, params["headers"])
# initial page does not have additional data in the input form # initial page does not have additional data in the input form
if params['pageno'] > 1: if params['pageno'] > 1:
# request the second page (and more pages) needs 'o' and 'api' arguments
params['data']['o'] = 'json'
params['data']['api'] = 'd.js'
# initial page does not have additional data in the input form params['data']['o'] = form_data.get('o', 'json')
if params['pageno'] > 2: params['data']['api'] = form_data.get('api', 'd.js')
# request the third page (and more pages) some more arguments params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['nextParams'] = '' params['data']['v'] = form_data.get('v', 'l')
params['data']['v'] = ''
params['data']['vqd'] = ''
region_code = get_region_code(params['language'], supported_languages) params['data']['kl'] = eng_region
if region_code: params['cookies']['kl'] = eng_region
params['data']['kl'] = region_code
params['cookies']['kl'] = region_code
params['data']['df'] = '' params['data']['df'] = ''
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
@ -116,26 +248,40 @@ def request(query, params):
return params return params
# get response from search-request
def response(resp): def response(resp):
headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
get(url_ping, headers=headers_ping)
if resp.status_code == 303: if resp.status_code == 303:
return [] return []
results = [] results = []
doc = fromstring(resp.text) doc = lxml.html.fromstring(resp.text)
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
if not len(result_table) >= 3:
if len(result_table) == 2:
# some locales (at least China) do not have a "next page" button and
# the layout of the HTML tables is different.
result_table = result_table[1]
elif not len(result_table) >= 3:
# no more results # no more results
return [] return []
result_table = result_table[2] else:
result_table = result_table[2]
# update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form):
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
query = resp.search_params['data']['q']
cache_vqd(query, value)
tr_rows = eval_xpath(result_table, './/tr') tr_rows = eval_xpath(result_table, './/tr')
# In the last <tr> is the form of the 'previous/next page' links # In the last <tr> is the form of the 'previous/next page' links
tr_rows = tr_rows[:-1] tr_rows = tr_rows[:-1]
@ -172,15 +318,105 @@ def response(resp):
return results return results
# get supported languages from their site def fetch_traits(engine_traits: EngineTraits):
def _fetch_supported_languages(resp): """Fetch languages & regions from DuckDuckGo.
# response is a js file with regions as an embedded object SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
response_page = resp.text DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no
response_page = response_page[response_page.find('regions:{') + 8 :] sense in a SearXNG request since SearXNG's ``all`` will not add a
response_page = response_page[: response_page.find('}') + 1] ``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
regions_json = loads(response_page) Besides regions, DuckDuckGo also defines its languages by region codes. For
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) example, these are the English languages in DuckDuckGo:
return list(supported_languages) - en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u588 to u661 / should be updated automatically?
resp = network.get('https://duckduckgo.com/util/u661.js')
if not resp.ok:
print("ERROR: response from DuckDuckGo is not OK.")
pos = resp.text.find('regions:{') + 8
js_code = resp.text[pos:]
pos = js_code.find('}') + 1
regions = json.loads(js_code[:pos])
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
pos = resp.text.find('languages:{') + 10
js_code = resp.text[pos:]
pos = js_code.find('}') + 1
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
languages = json.loads(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang

View file

@ -1,22 +1,33 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""DuckDuckGo (Instant Answer API) """
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
reverse engineering we can see that some services (e.g. instant answers) are still
in use by the DDG search engine.
As far as we can say the *instant answers* API does not support languages, or at
least we could not find out how language support should work. It seems that
most of the features are based on English terms.
""" """
import json from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin from urllib.parse import urlencode, urlparse, urljoin
from lxml import html from lxml import html
from searx.data import WIKIDATA_UNITS from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import language_aliases from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.engines.duckduckgo import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
)
from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about # about
about = { about = {
"website": 'https://duckduckgo.com/', "website": 'https://duckduckgo.com/',
@ -37,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text): def is_broken_text(text):
"""duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>" """duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken, the "Related website" may contains some HTML. The href URL is broken, the "Related website" may contains some HTML.
@ -62,8 +73,6 @@ def result_to_text(text, htmlResult):
def request(query, params): def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query})) params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0]
return params return params
@ -71,7 +80,7 @@ def response(resp):
# pylint: disable=too-many-locals, too-many-branches, too-many-statements # pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = [] results = []
search_res = json.loads(resp.text) search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) : # search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall # * continent / country / department / location / waterfall
@ -235,7 +244,7 @@ def unit_to_str(unit):
def area_to_str(area): def area_to_str(area):
"""parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" """parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit')) unit = unit_to_str(area.get('unit'))
if unit is not None: if unit is not None:
try: try:

View file

@ -1,26 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" """
DuckDuckGo (Images) DuckDuckGo Images
~~~~~~~~~~~~~~~~~
""" """
from json import loads from typing import TYPE_CHECKING
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import ( # pylint: disable=unused-import from searx.engines.duckduckgo import (
_fetch_supported_languages, get_ddg_lang,
supported_languages_url, get_vqd,
) )
from searx.network import get from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
"website": 'https://duckduckgo.com/', "website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805', "wikidata_id": 'Q12805',
"official_api_documentation": {
'url': 'https://duckduckgo.com/api',
'comment': 'but images are not supported',
},
"use_official_api": False, "use_official_api": False,
"require_api_key": False, "require_api_key": False,
"results": 'JSON (site requires js to get images)', "results": 'JSON (site requires js to get images)',
@ -32,70 +36,64 @@ paging = True
safesearch = True safesearch = True
send_accept_language_header = True send_accept_language_header = True
# search-url safesearch_cookies = {0: '-2', 1: None, 2: '1'}
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' safesearch_args = {0: '1', 1: None, 2: '1'}
site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query, headers):
query_url = site_url.format(query=urlencode({'q': query}))
res = get(query_url, headers=headers)
content = res.text
if content.find('vqd=\'') == -1:
raise SearxEngineAPIException('Request failed')
vqd = content[content.find('vqd=\'') + 5 :]
vqd = vqd[: vqd.find('\'')]
return vqd
# do search-request
def request(query, params): def request(query, params):
# to avoid running actual external requests when testing
if 'is_test' not in params:
vqd = get_vqd(query, params['headers'])
else:
vqd = '12345'
offset = (params['pageno'] - 1) * 50 eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
safesearch = params['safesearch'] - 1 args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'vqd': get_vqd(query, params["headers"]),
}
region_code = get_region_code(params['language'], lang_list=supported_languages) if params['pageno'] > 1:
if region_code: args['s'] = (params['pageno'] - 1) * 100
params['url'] = images_url.format(
query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd params['cookies']['ad'] = eng_lang # zh_CN
) params['cookies']['ah'] = eng_region # "us-en,de-de"
else: params['cookies']['l'] = eng_region # "hk-tzh"
params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd) logger.debug("cookies: %s", params['cookies'])
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
args = urlencode(args)
params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,')
params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01'
params['headers']['Referer'] = 'https://duckduckgo.com/'
params['headers']['X-Requested-With'] = 'XMLHttpRequest'
logger.debug("headers: %s", params['headers'])
return params return params
# get response from search-request
def response(resp): def response(resp):
results = [] results = []
res_json = resp.json()
content = resp.text
res_json = loads(content)
# parse results
for result in res_json['results']: for result in res_json['results']:
title = result['title']
url = result['url']
thumbnail = result['thumbnail']
image = result['image']
# append result
results.append( results.append(
{ {
'template': 'images.html', 'template': 'images.html',
'title': title, 'title': result['title'],
'content': '', 'content': '',
'thumbnail_src': thumbnail, 'thumbnail_src': result['thumbnail'],
'img_src': image, 'img_src': result['image'],
'url': url, 'url': result['url'],
'img_format': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
} }
) )

View file

@ -1,13 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""DuckDuckGo Weather""" """
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads from json import loads
from urllib.parse import quote from urllib.parse import quote
from datetime import datetime from datetime import datetime
from flask_babel import gettext from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = { about = {
"website": 'https://duckduckgo.com/', "website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805', "wikidata_id": 'Q12805',
@ -17,9 +33,11 @@ about = {
"results": "JSON", "results": "JSON",
} }
categories = ["others"] send_accept_language_header = True
url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" # engine dependent config
categories = ["others"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition): def generate_condition_table(condition):
@ -72,8 +90,17 @@ def generate_day_table(day):
def request(query, params): def request(query, params):
params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0])
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params return params

View file

@ -25,6 +25,7 @@ base_url = 'https://wiki.gentoo.org'
# xpath queries # xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li' xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a' xpath_link = './/div[@class="mw-search-result-heading"]/a'
xpath_content = './/div[@class="searchresult"]'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on # cut 'en' from 'en-US', 'de' from 'de-CH', and so on
@ -77,8 +78,6 @@ main_langs = {
'uk': 'Українська', 'uk': 'Українська',
'zh': '简体中文', 'zh': '简体中文',
} }
supported_languages = dict(lang_urls, **main_langs)
# do search-request # do search-request
def request(query, params): def request(query, params):
@ -118,7 +117,8 @@ def response(resp):
link = result.xpath(xpath_link)[0] link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href')) href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link) title = extract_text(link)
content = extract_text(result.xpath(xpath_content))
results.append({'url': href, 'title': title}) results.append({'url': href, 'title': title, 'content': content})
return results return results

View file

@ -1,34 +1,39 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""This is the implementation of the google WEB engine. Some of this """This is the implementation of the Google WEB engine. Some of this
implementations are shared by other engines: implementations (mainly the :py:obj:`get_google_info`) are shared by other
engines:
- :ref:`google images engine` - :ref:`google images engine`
- :ref:`google news engine` - :ref:`google news engine`
- :ref:`google videos engine` - :ref:`google videos engine`
- :ref:`google scholar engine`
The google WEB engine itself has a special setup option: - :ref:`google autocomplete`
.. code:: yaml
- name: google
...
use_mobile_ui: false
``use_mobile_ui``: (default: ``false``)
Enables to use *mobile endpoint* to bypass the google blocking (see
:issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
results` is not affected by Google rate limiting and we can still do requests
while actively blocked by the original Google search. By activate
``use_mobile_ui`` this behavior is simulated by adding the parameter
``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
""" """
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex import babel
import babel.core
import babel.languages
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_offical_locales
from searx import network
from searx.exceptions import SearxEngineCaptchaException from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -45,64 +50,6 @@ categories = ['general', 'web']
paging = True paging = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
google_domains = {
'BG': 'google.bg', # Bulgaria
'CZ': 'google.cz', # Czech Republic
'DE': 'google.de', # Germany
'DK': 'google.dk', # Denmark
'AT': 'google.at', # Austria
'CH': 'google.ch', # Switzerland
'GR': 'google.gr', # Greece
'AU': 'google.com.au', # Australia
'CA': 'google.ca', # Canada
'GB': 'google.co.uk', # United Kingdom
'ID': 'google.co.id', # Indonesia
'IE': 'google.ie', # Ireland
'IN': 'google.co.in', # India
'MY': 'google.com.my', # Malaysia
'NZ': 'google.co.nz', # New Zealand
'PH': 'google.com.ph', # Philippines
'SG': 'google.com.sg', # Singapore
'US': 'google.com', # United States (google.us) redirects to .com
'ZA': 'google.co.za', # South Africa
'AR': 'google.com.ar', # Argentina
'CL': 'google.cl', # Chile
'ES': 'google.es', # Spain
'MX': 'google.com.mx', # Mexico
'EE': 'google.ee', # Estonia
'FI': 'google.fi', # Finland
'BE': 'google.be', # Belgium
'FR': 'google.fr', # France
'IL': 'google.co.il', # Israel
'HR': 'google.hr', # Croatia
'HU': 'google.hu', # Hungary
'IT': 'google.it', # Italy
'JP': 'google.co.jp', # Japan
'KR': 'google.co.kr', # South Korea
'LT': 'google.lt', # Lithuania
'LV': 'google.lv', # Latvia
'NO': 'google.no', # Norway
'NL': 'google.nl', # Netherlands
'PL': 'google.pl', # Poland
'BR': 'google.com.br', # Brazil
'PT': 'google.pt', # Portugal
'RO': 'google.ro', # Romania
'RU': 'google.ru', # Russia
'SK': 'google.sk', # Slovakia
'SI': 'google.si', # Slovenia
'SE': 'google.se', # Sweden
'TH': 'google.co.th', # Thailand
'TR': 'google.com.tr', # Turkey
'UA': 'google.com.ua', # Ukraine
'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
'HK': 'google.com.hk', # Hong Kong
'TW': 'google.com.tw', # Taiwan
}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
@ -112,50 +59,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables # specific xpath variables
# ------------------------ # ------------------------
results_xpath = './/div[@data-sokoban-container]' results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]' title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href' href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]' content_xpath = './/div[@data-sncf]'
# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'
# Suggestions are links placed in a *card-section*, we extract only the text # Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself. # from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
# # celebrities like '!google natasha allegri'
# # or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
"""Composing various language properties for the google engines. def get_google_info(params, eng_traits):
"""Composing various (language) properties for the google engines (:ref:`google
API`).
This function is called by the various google engines (:ref:`google web This function is called by the various google engines (:ref:`google web
engine`, :ref:`google images engine`, :ref:`google news engine` and engine`, :ref:`google images engine`, :ref:`google news engine` and
:ref:`google videos engine`). :ref:`google videos engine`).
:param dict param: request parameters of the engine :param dict param: Request parameters of the engine. At least
a ``searxng_locale`` key should be in the dictionary.
:param list lang_list: list of supported languages of the engine :param eng_traits: Engine's traits fetched from google preferences
:py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` (:py:obj:`searx.enginelib.traits.EngineTraits`)
:param dict lang_list: custom aliases for non standard language codes
(used when calling :py:func:`searx.utils.match_language`)
:param bool supported_any_language: When a language is not specified, the
language interpretation is left up to Google to decide how the search
results should be delivered. This argument is ``True`` for the google
engine and ``False`` for the other engines (google-images, -news,
-scholar, -videos).
:rtype: dict :rtype: dict
:returns: :returns:
Py-Dictionary with the key/value pairs: Py-Dictionary with the key/value pairs:
language: language:
Return value from :py:func:`searx.utils.match_language` The language code that is used by google (e.g. ``lang_en`` or
``lang_zh-TW``)
country: country:
The country code (e.g. US, AT, CA, FR, DE ..) The country code that is used by google (e.g. ``US`` or ``TW``)
locale:
An instance of :py:obj:`babel.core.Locale` built from the
``searxng_locale`` value.
subdomain: subdomain:
Google subdomain :py:obj:`google_domains` that fits to the country Google subdomain :py:obj:`google_domains` that fits to the country
@ -165,52 +112,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
Py-Dictionary with additional request arguments (can be passed to Py-Dictionary with additional request arguments (can be passed to
:py:func:`urllib.parse.urlencode`). :py:func:`urllib.parse.urlencode`).
- ``hl`` parameter: specifies the interface language of user interface.
- ``lr`` parameter: restricts search results to documents written in
a particular language.
- ``cr`` parameter: restricts search results to documents
originating in a particular country.
- ``ie`` parameter: sets the character encoding scheme that should
be used to interpret the query string ('utf8').
- ``oe`` parameter: sets the character encoding scheme that should
be used to decode the XML result ('utf8').
headers: headers:
Py-Dictionary with additional HTTP headers (can be passed to Py-Dictionary with additional HTTP headers (can be passed to
request's headers) request's headers)
- ``Accept: '*/*``
""" """
ret_val = { ret_val = {
'language': None, 'language': None,
'country': None, 'country': None,
'subdomain': None, 'subdomain': None,
'params': {}, 'params': {},
'headers': {}, 'headers': {},
'cookies': {},
'locale': None,
} }
# language ... sxng_locale = params.get('searxng_locale', 'all')
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.core.UnknownLocaleError:
locale = None
_lang = params['language'] eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
_any_language = _lang.lower() == 'all' lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
if _any_language: country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
_lang = 'en-US'
language = match_language(_lang, lang_list, custom_aliases)
ret_val['language'] = language
# country ... # Test zh_hans & zh_hant --> in the topmost links in the result list of list
# TW and HK you should find a wiktionary.org zh_hant link. In the result
# list of zh-CN there should be no hant link, instead you should find
# zh.m.wikipedia.org/zh somewhere in the top.
_l = _lang.split('-') # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
if len(_l) == 2: # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
country = _l[1]
else: ret_val['language'] = eng_lang
country = _l[0].upper()
if country == 'EN':
country = 'US'
ret_val['country'] = country ret_val['country'] = country
ret_val['locale'] = locale
# subdomain ... ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
# params & headers
lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
# hl parameter: # hl parameter:
# https://developers.google.com/custom-search/docs/xml_results#hlsp The # The hl parameter specifies the interface language (host language) of
# Interface Language: # your user interface. To improve the performance and the quality of your
# search results, you are strongly encouraged to set this parameter
# explicitly.
# https://developers.google.com/custom-search/docs/xml_results#hlsp
# The Interface Language:
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
ret_val['params']['hl'] = lang_list.get(lang_country, language) ret_val['params']['hl'] = lang_code
# lr parameter: # lr parameter:
# The lr (language restrict) parameter restricts search results to # The lr (language restrict) parameter restricts search results to
@ -218,22 +180,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
# https://developers.google.com/custom-search/docs/xml_results#lrsp # https://developers.google.com/custom-search/docs/xml_results#lrsp
# Language Collection Values: # Language Collection Values:
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
#
# To select 'all' languages an empty 'lr' value is used.
#
# Different to other google services, Google Scholar supports selecting more
# than one language. The languages are separated by a pipe '|' (logical OR).
# By example: &lr=lang_zh-TW%7Clang_de selects articles written in
# traditional chinese OR german language.
if _any_language and supported_any_language: ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
# interpretation is left up to Google (based on whoogle) # cr parameter:
# # The cr parameter restricts search results to documents originating in a
# - add parameter ``source=lnt`` # particular country.
# - don't use parameter ``lr`` # https://developers.google.com/custom-search/docs/xml_results#crsp
# - don't add a ``Accept-Language`` HTTP header.
ret_val['params']['source'] = 'lnt' ret_val['params']['cr'] = 'country' + country
if sxng_locale == 'all':
ret_val['params']['cr'] = ''
else: # gl parameter: (mandatory by Google News)
# The gl parameter value is a two-letter country code. For WebSearch
# results, the gl parameter boosts search results whose country of origin
# matches the parameter value. See the Country Codes section for a list of
# valid values.
# Specifying a gl parameter value in WebSearch requests should improve the
# relevance of results. This is particularly true for international
# customers and, even more specifically, for customers in English-speaking
# countries other than the United States.
# https://developers.google.com/custom-search/docs/xml_results#glsp
# restricts search results to documents written in a particular ret_val['params']['gl'] = country
# language.
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) # ie parameter:
# The ie parameter sets the character encoding scheme that should be used
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
# num parameter:
# The num parameter identifies the number of search results to return.
# The default num value is 10, and the maximum value is 20. If you request
# more than 20 results, only 20 results will be returned.
# https://developers.google.com/custom-search/docs/xml_results#numsp
# HINT: seems to have no effect (tested in google WEB & Images)
# ret_val['params']['num'] = 20
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
return ret_val return ret_val
@ -245,33 +257,34 @@ def detect_google_sorry(resp):
def request(query, params): def request(query, params):
"""Google search request""" """Google search request"""
# pylint: disable=line-too-long
offset = (params['pageno'] - 1) * 10 offset = (params['pageno'] - 1) * 10
google_info = get_google_info(params, traits)
lang_info = get_lang_info(params, supported_languages, language_aliases, True)
additional_parameters = {}
if use_mobile_ui:
additional_parameters = {
'asearch': 'arc',
'async': 'use_ac:true,_fmt:prog',
}
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = ( query_url = (
'https://' 'https://'
+ lang_info['subdomain'] + google_info['subdomain']
+ '/search' + '/search'
+ "?" + "?"
+ urlencode( + urlencode(
{ {
'q': query, 'q': query,
**lang_info['params'], **google_info['params'],
'ie': "utf8",
'oe': "utf8",
'start': offset,
'filter': '0', 'filter': '0',
**additional_parameters, 'start': offset,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
# 'sa': 'N',
# 'yv': 3,
# 'prmd': 'vin',
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formerly known as use_mobile_ui
'asearch': 'arc',
'async': UI_ASYNC,
} }
) )
) )
@ -282,25 +295,38 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url params['url'] = query_url
params['cookies']['CONSENT'] = "YES+" params['cookies'] = google_info['cookies']
params['headers'].update(lang_info['headers']) params['headers'].update(google_info['headers'])
if use_mobile_ui:
params['headers']['Accept'] = '*/*'
else:
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
def _parse_data_images(dom):
    """Collect the inline ``data:image`` payloads found in the text of ``dom``.

    The text content of ``dom`` is scanned with ``RE_DATA_IMAGE``; every match
    yields an image id (``dimg_...``) and the ``data:image`` string that
    belongs to it.

    :param dom: parsed document; only its ``text_content()`` method is used.
    :returns: dict mapping image id --> ``data:image`` string
    """
    images_by_id = {}
    for image_id, payload in RE_DATA_IMAGE.findall(dom.text_content()):
        # Keep everything up to and including the last '='; whatever follows
        # it in the match is dropped.  A payload without '=' (or with '=' only
        # at position 0) is stored unchanged.
        last_eq = payload.rfind('=')
        images_by_id[image_id] = payload[: last_eq + 1] if last_eq > 0 else payload
    logger.debug('data:image objects --> %s', list(images_by_id.keys()))
    return images_by_id
def response(resp): def response(resp):
"""Get response from google's search request""" """Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp) detect_google_sorry(resp)
results = [] results = []
# convert the text to dom # convert the text to dom
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
data_image_map = _parse_data_images(dom)
# results --> answer # results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
if answer_list: if answer_list:
@ -309,25 +335,9 @@ def response(resp):
else: else:
logger.debug("did not find 'answer'") logger.debug("did not find 'answer'")
# results --> number_of_results
if not use_mobile_ui:
try:
_txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
_digit = ''.join([n for n in _txt if n.isdigit()])
number_of_results = int(_digit)
results.append({'number_of_results': number_of_results})
except Exception as e: # pylint: disable=broad-except
logger.debug("did not 'number_of_results'")
logger.error(e, exc_info=True)
# parse results # parse results
for result in eval_xpath_list(dom, results_xpath): for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
# google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ignoring <g-section-with-header>")
continue
try: try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
@ -336,16 +346,30 @@ def response(resp):
logger.debug('ignoring item from the result_xpath list: missing title') logger.debug('ignoring item from the result_xpath list: missing title')
continue continue
title = extract_text(title_tag) title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None) url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None: if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue continue
content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
if content is None: content_nodes = eval_xpath(result, content_xpath)
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title) logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue continue
logger.debug('add link to results: %s', title) img_src = content_nodes[0].xpath('.//img/@src')
results.append({'url': url, 'title': title, 'content': content}) if img_src:
img_src = img_src[0]
if img_src.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
img_src = data_image_map.get(img_id[0])
else:
img_src = None
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
except Exception as e: # pylint: disable=broad-except except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True) logger.error(e, exc_info=True)
@ -361,15 +385,107 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp):
ret_val = {}
# Google country codes that are skipped by fetch_traits() when the supported
# regions are collected (see the ``eng_country in skip_countries`` check there).
skip_countries = [
    # official language of google-country not in google-languages
    'AL',  # Albania (sq)
    'AZ',  # Azerbaijan (az)
    'BD',  # Bangladesh (bn)
    'BN',  # Brunei Darussalam (ms)
    'BT',  # Bhutan (dz)
    'ET',  # Ethiopia (am)
    'GE',  # Georgia (ka, os)
    'GL',  # Greenland (kl)
    'KH',  # Cambodia (km)
    'LA',  # Laos (lo)
    'LK',  # Sri Lanka (si, ta)
    'ME',  # Montenegro (sr)
    'MK',  # North Macedonia (mk, sq)
    'MM',  # Myanmar (my)
    'MN',  # Mongolia (mn)
    'MV',  # Maldives (dv) // dv_MV is unknown by babel
    'MY',  # Malaysia (ms)
    'NP',  # Nepal (ne)
    'TJ',  # Tajikistan (tg)
    'TM',  # Turkmenistan (tk)
    'UZ',  # Uzbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
resp = network.get('https://www.google.com/preferences')
if not resp.ok:
raise RuntimeError("Response from Google's preferences is not OK.")
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]') # supported language codes
for x in radio_buttons: lang_map = {'no': 'nb'}
name = x.get("data-name") for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
code = x.get("value").split('_')[-1]
ret_val[code] = {"name": name}
return ret_val eng_lang = x.get("value").split('_')[-1]
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
# supported region codes
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
# supported domains
if add_domains:
resp = network.get('https://www.google.com/supported_domains')
if not resp.ok:
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
for domain in resp.text.split():
domain = domain.strip()
if not domain or domain in [
'.google.com',
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain
if region == 'HK':
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain

View file

@ -1,31 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""This is the implementation of the google images engine using the google """This is the implementation of the Google Images engine using the internal
internal API used the Google Go Android app. Google API used by the Google Go Android app.
This internal API offers results in This internal API offers results in
- JSON (_fmt:json) - JSON (``_fmt:json``)
- Protobuf (_fmt:pb) - Protobuf_ (``_fmt:pb``)
- Protobuf compressed? (_fmt:pc) - Protobuf_ compressed? (``_fmt:pc``)
- HTML (_fmt:html) - HTML (``_fmt:html``)
- Protobuf encoded in JSON (_fmt:jspb). - Protobuf_ encoded in JSON (``_fmt:jspb``).
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
""" """
from typing import TYPE_CHECKING
from urllib.parse import urlencode from urllib.parse import urlencode
from json import loads from json import loads
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import ( from searx.engines.google import (
get_lang_info, get_google_info,
time_range_dict, time_range_dict,
detect_google_sorry, detect_google_sorry,
) )
# pylint: disable=unused-import if TYPE_CHECKING:
from searx.engines.google import supported_languages_url, _fetch_supported_languages import logging
from searx.enginelib.traits import EngineTraits
logger: logging.Logger
traits: EngineTraits
# pylint: enable=unused-import
# about # about
about = { about = {
@ -40,7 +47,6 @@ about = {
# engine dependent config # engine dependent config
categories = ['images', 'web'] categories = ['images', 'web']
paging = True paging = True
use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True send_accept_language_header = True
@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def request(query, params): def request(query, params):
"""Google-Image search request""" """Google-Image search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) google_info = get_google_info(params, traits)
query_url = ( query_url = (
'https://' 'https://'
+ lang_info['subdomain'] + google_info['subdomain']
+ '/search' + '/search'
+ "?" + "?"
+ urlencode( + urlencode(
{ {
'q': query, 'q': query,
'tbm': "isch", 'tbm': "isch",
**lang_info['params'], **google_info['params'],
'ie': "utf8",
'oe': "utf8",
'asearch': 'isch', 'asearch': 'isch',
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
} }
@ -77,9 +81,8 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url params['url'] = query_url
params['headers'].update(lang_info['headers']) params['cookies'] = google_info['cookies']
params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip' params['headers'].update(google_info['headers'])
params['headers']['Accept'] = '*/*'
return params return params
@ -111,7 +114,11 @@ def response(resp):
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice: if copyright_notice:
result_item['source'] += ' / ' + copyright_notice result_item['source'] += ' | ' + copyright_notice
freshness_date = item["result"].get("freshness_date")
if freshness_date:
result_item['source'] += ' | ' + freshness_date
file_size = item.get('gsa', {}).get('file_size') file_size = item.get('gsa', {}).get('file_size')
if file_size: if file_size:

View file

@ -1,24 +1,40 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""This is the implementation of the google news engine. The google news API """This is the implementation of the Google News engine.
ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored Google News has a different region handling compared to Google WEB.
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and different to Google WEB)
- the gl_ argument is mandatory
If one of these arguments is not set correctly, the request is redirected to
CONSENT dialog::
https://consent.google.com/m?continue=
The google news API ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored / there is no paging all
results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch* - save_ : is ignored / Google-News results are always *SafeSearch*
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
""" """
# pylint: disable=invalid-name from typing import TYPE_CHECKING
import binascii import binascii
import re import re
from urllib.parse import urlencode from urllib.parse import urlencode
from base64 import b64decode from base64 import b64decode
from lxml import html from lxml import html
import babel
from searx import locales
from searx.utils import ( from searx.utils import (
eval_xpath, eval_xpath,
eval_xpath_list, eval_xpath_list,
@ -26,18 +42,19 @@ from searx.utils import (
extract_text, extract_text,
) )
# pylint: disable=unused-import from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import ( from searx.engines.google import (
supported_languages_url, get_google_info,
_fetch_supported_languages,
)
# pylint: enable=unused-import
from searx.engines.google import (
get_lang_info,
detect_google_sorry, detect_google_sorry,
) )
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -49,70 +66,77 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
# compared to other google engines google-news has a different time range
# support. The time range is included in the search term.
time_range_dict = {
'day': 'when:1d',
'week': 'when:7d',
'month': 'when:1m',
'year': 'when:1y',
}
# engine dependent config # engine dependent config
categories = ['news'] categories = ['news']
paging = False paging = False
use_locale_domain = True time_range_support = False
time_range_support = True
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to # Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise checker will report safesearch-errors:: # False here, otherwise checker will report safesearch-errors::
# #
# safesearch : results are identical for safesearch=0 and safesearch=2 # safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False safesearch = True
send_accept_language_header = True # send_accept_language_header = True
def request(query, params): def request(query, params):
"""Google-News search request""" """Google-News search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) sxng_locale = params.get('searxng_locale', 'en-US')
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
google_info = get_google_info(params, traits)
google_info['subdomain'] = 'news.google.com' # google news has only one domain
# google news has only one domain ceid_region, ceid_lang = ceid.split(':')
lang_info['subdomain'] = 'news.google.com' ceid_lang, ceid_suffix = (
ceid_lang.split('-')
+ [
None,
]
)[:2]
ceid = "%s:%s" % (lang_info['country'], lang_info['language']) google_info['params']['hl'] = ceid_lang
# google news redirects en to en-US if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
if lang_info['params']['hl'] == 'en':
lang_info['params']['hl'] = 'en-US'
# Very special to google-news compared to other google engines, the time if ceid_region.lower() == ceid_lang:
# range is included in the search term. google_info['params']['hl'] = ceid_lang + '-' + ceid_region
if params['time_range']: else:
query += ' ' + time_range_dict[params['time_range']] google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
elif ceid_region.lower() != ceid_lang:
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
google_info['params']['hl'] = ceid_lang
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
google_info['params']['gl'] = ceid_region
query_url = ( query_url = (
'https://' 'https://'
+ lang_info['subdomain'] + google_info['subdomain']
+ '/search' + "/search?"
+ "?" + urlencode(
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']}) {
'q': query,
**google_info['params'],
}
)
# ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid) + ('&ceid=%s' % ceid)
) # ceid includes a ':' character which must not be urlencoded )
params['url'] = query_url params['url'] = query_url
params['cookies'] = google_info['cookies']
params['cookies']['CONSENT'] = "YES+" params['headers'].update(google_info['headers'])
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params
def response(resp): def response(resp):
"""Get response from google's search request""" """Get response from google's search request"""
results = [] results = []
detect_google_sorry(resp) detect_google_sorry(resp)
# convert the text to dom # convert the text to dom
@ -152,8 +176,8 @@ def response(resp):
# The pub_date is mostly a string like 'yesterday', not a real # The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate. # timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time')) pub_date = extract_text(eval_xpath(result, './article//time'))
pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a')) pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x]) content = ' / '.join([x for x in [pub_origin, pub_date] if x])
@ -174,3 +198,127 @@ def response(resp):
# return results # return results
return results return results
ceid_list = [
'AE:ar',
'AR:es-419',
'AT:de',
'AU:en',
'BD:bn',
'BE:fr',
'BE:nl',
'BG:bg',
'BR:pt-419',
'BW:en',
'CA:en',
'CA:fr',
'CH:de',
'CH:fr',
'CL:es-419',
'CN:zh-Hans',
'CO:es-419',
'CU:es-419',
'CZ:cs',
'DE:de',
'EG:ar',
'ES:es',
'ET:en',
'FR:fr',
'GB:en',
'GH:en',
'GR:el',
'HK:zh-Hant',
'HU:hu',
'ID:en',
'ID:id',
'IE:en',
'IL:en',
'IL:he',
'IN:bn',
'IN:en',
'IN:hi',
'IN:ml',
'IN:mr',
'IN:ta',
'IN:te',
'IT:it',
'JP:ja',
'KE:en',
'KR:ko',
'LB:ar',
'LT:lt',
'LV:en',
'LV:lv',
'MA:fr',
'MX:es-419',
'MY:en',
'NA:en',
'NG:en',
'NL:nl',
'NO:no',
'NZ:en',
'PE:es-419',
'PH:en',
'PK:en',
'PL:pl',
'PT:pt-150',
'RO:ro',
'RS:sr',
'RU:ru',
'SA:ar',
'SE:sv',
'SG:en',
'SI:sl',
'SK:sk',
'SN:fr',
'TH:th',
'TR:tr',
'TW:zh-Hant',
'TZ:en',
'UA:ru',
'UA:uk',
'UG:en',
'US:en',
'US:es-419',
'VE:es-419',
'VN:vi',
'ZA:en',
'ZW:en',
]
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""
_skip_values = [
'ET:en', # english (ethiopia)
'ID:en', # english (indonesia)
'LV:en', # english (latvia)
]
_ceid_locale_map = {'NO:no': 'nb-NO'}
def fetch_traits(engine_traits: EngineTraits):
    """Fetch the Google traits and add the Google News ``ceid`` mapping.

    The languages and regions are fetched by the ``fetch_traits`` function of
    the Google WEB engine (called without fetching the supported domains).
    Afterwards ``engine_traits.custom['ceid']`` is filled with a mapping from
    SearXNG's region tag to the ``ceid`` value of :py:obj:`ceid_list`.
    Entries listed in ``_skip_values`` are left out, entries unknown to babel
    are reported on stdout and skipped.
    """
    _fetch_traits(engine_traits, add_domains=False)

    engine_traits.custom['ceid'] = {}

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

        region, lang = ceid.split(':')
        lang_parts = lang.split('-')
        # A script subtag (Hant / Hans) stays part of the language, any other
        # suffix (e.g. '419' or '150') is dropped.
        if len(lang_parts) > 1 and lang_parts[1] not in ['Hant', 'Hans']:
            lang = lang_parts[0]

        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
        try:
            locale = babel.Locale.parse(sxng_locale, sep='-')
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

        sxng_region = locales.region_tag(locale)
        engine_traits.custom['ceid'][sxng_region] = ceid

View file

@ -1,19 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Google (Scholar) """This is the implementation of the Google Scholar engine.
For detailed description of the *REST-full* API see: `Query Parameter Compared to other Google services the Scholar engine has a simple GET REST-API
Definitions`_. and there does not exist an `async` API. Even though the API is slightly vintage we
can make use of the :ref:`google API` to assemble the arguments of the GET
.. _Query Parameter Definitions: request.
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
""" """
# pylint: disable=invalid-name from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from typing import Optional
from lxml import html from lxml import html
from searx.utils import ( from searx.utils import (
@ -23,19 +22,21 @@ from searx.utils import (
extract_text, extract_text,
) )
from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import ( from searx.engines.google import (
get_lang_info, get_google_info,
time_range_dict, time_range_dict,
detect_google_sorry,
) )
from searx.enginelib.traits import EngineTraits
# pylint: disable=unused-import if TYPE_CHECKING:
from searx.engines.google import ( import logging
supported_languages_url,
_fetch_supported_languages,
)
# pylint: enable=unused-import logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -51,53 +52,62 @@ about = {
categories = ['science', 'scientific publications'] categories = ['science', 'scientific publications']
paging = True paging = True
language_support = True language_support = True
use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = False safesearch = False
send_accept_language_header = True send_accept_language_header = True
def time_range_url(params): def time_range_args(params):
"""Returns a URL query component for a google-Scholar time range based on """Returns a dictionary with a time range arguments based on
``params['time_range']``. Google-Scholar does only support ranges in years. ``params['time_range']``.
To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*)
are mapped to *year*. If no range is set, an empty string is returned. Google Scholar supports a detailed search by year. Searching by *last
Example:: month* or *last week* (as offered by SearXNG) is uncommon for scientific
publications and is not supported by Google Scholar.
To limit the result list when the user selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
is set an empty dictionary of arguments is returned. Example; when
user selects a time range (current year minus one in 2022):
.. code:: python
{ 'as_ylo' : 2021 }
&as_ylo=2019
""" """
# as_ylo=2016&as_yhi=2019 ret_val = {}
ret_val = ''
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
ret_val = urlencode({'as_ylo': datetime.now().year - 1}) ret_val['as_ylo'] = datetime.now().year - 1
return '&' + ret_val return ret_val
def detect_google_captcha(dom):
    """Raise :py:obj:`SearxEngineCaptchaException` if *dom* is a CAPTCHA page.

    In case of a CAPTCHA Google Scholar opens its own *not a Robot* dialog and
    is not redirected to ``sorry.google.com``.  The dialog is detected by the
    form with the id ``gs_captcha_f``.
    """
    captcha_form = eval_xpath(dom, "//form[@id='gs_captcha_f']")
    if not captcha_form:
        return
    raise SearxEngineCaptchaException()
def request(query, params): def request(query, params):
"""Google-Scholar search request""" """Google-Scholar search request"""
offset = (params['pageno'] - 1) * 10 google_info = get_google_info(params, traits)
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
# subdomain is: scholar.google.xy # subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
query_url = ( args = {
'https://' 'q': query,
+ lang_info['subdomain'] **google_info['params'],
+ '/scholar' 'start': (params['pageno'] - 1) * 10,
+ "?" 'as_sdt': '2007', # include patents / to disable set '0,5'
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset}) 'as_vis': '0', # include citations / to disable set '1'
) }
args.update(time_range_args(params))
query_url += time_range_url(params) params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['url'] = query_url params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
# params['google_subdomain'] = subdomain
return params return params
@ -138,19 +148,15 @@ def parse_gs_a(text: Optional[str]):
def response(resp): # pylint: disable=too-many-locals def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request""" """Parse response from Google Scholar"""
results = [] results = []
detect_google_sorry(resp)
# which subdomain ?
# subdomain = resp.search_params.get('google_subdomain')
# convert the text to dom # convert the text to dom
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results # parse results
for result in eval_xpath_list(dom, '//div[@data-cid]'): for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a')) title = extract_text(eval_xpath(result, './/h3[1]//a'))
@ -158,7 +164,7 @@ def response(resp): # pylint: disable=too-many-locals
# this is a [ZITATION] block # this is a [ZITATION] block
continue continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type: if pub_type:
pub_type = pub_type[1:-1].lower() pub_type = pub_type[1:-1].lower()

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""This is the implementation of the google videos engine. """This is the implementation of the Google Videos engine.
.. admonition:: Content-Security-Policy (CSP) .. admonition:: Content-Security-Policy (CSP)
@ -14,9 +14,8 @@
""" """
# pylint: disable=invalid-name from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
@ -27,20 +26,22 @@ from searx.utils import (
extract_text, extract_text,
) )
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import ( from searx.engines.google import (
get_lang_info, get_google_info,
time_range_dict, time_range_dict,
filter_mapping, filter_mapping,
g_section_with_header,
title_xpath,
suggestion_xpath, suggestion_xpath,
detect_google_sorry, detect_google_sorry,
) )
from searx.enginelib.traits import EngineTraits
# pylint: disable=unused-import if TYPE_CHECKING:
from searx.engines.google import supported_languages_url, _fetch_supported_languages import logging
# pylint: enable=unused-import logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -55,70 +56,32 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos', 'web'] categories = ['videos', 'web']
paging = False paging = True
language_support = True language_support = True
use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
# Cache of compiled regular expressions, keyed by the pattern string.
RE_CACHE = {}


def _re(regexpr):
    """Return the compiled regular expression for the pattern *regexpr*.

    The compiled pattern is stored in :py:obj:`RE_CACHE`.  The pattern is only
    compiled on a cache miss; a ``dict.get(key, re.compile(key))`` lookup would
    evaluate ``re.compile`` on every call, even when the pattern is cached.
    """
    try:
        return RE_CACHE[regexpr]
    except KeyError:
        compiled = RE_CACHE[regexpr] = re.compile(regexpr)
        return compiled
def scrap_out_thumbs_src(dom):
    """Scrap out thumbnail URLs from the ``google.ldi`` <script> tags.

    Returns a dictionary that maps an image id (``dimg_<number>``) to the URL
    found for it.  The escape sequences ``\\u003d`` and ``\\u0026`` in the URL
    are replaced by ``=`` and ``&``.
    """
    ret_val = {}
    thumb_name = 'dimg_'
    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
        _script = script.text
        # "dimg_35":"https://i.ytimg.c....",
        for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
            v = v.replace(r'\u003d', '=')
            v = v.replace(r'\u0026', '&')
            ret_val[k] = v
    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
def scrap_out_thumbs(dom):
    """Scrap out thumbnail data from the ``_setImagesSrc`` <script> tags.

    Returns a dictionary that maps an image id (``dimg_<number>``) to the
    image data found in the same script (first ``s='...'`` value, with
    ``\\x3d`` replaced by ``=``).  Scripts without image data are ignored.
    """
    thumb_name = 'dimg_'
    ret_val = {}

    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
        js_code = script.text

        # var s='data:image/jpeg;base64, ...'
        payloads = _re("s='([^']*)").findall(js_code)
        if payloads:
            # At least the equal sign in the URL needs to be decoded
            img_data = payloads[0].replace(r"\x3d", "=")

            # var ii=['dimg_17']
            for img_id in _re(r"(%s\d+)" % thumb_name).findall(js_code):
                ret_val[img_id] = img_data

    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
def request(query, params): def request(query, params):
"""Google-Video search request""" """Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) google_info = get_google_info(params, traits)
query_url = ( query_url = (
'https://' 'https://'
+ lang_info['subdomain'] + google_info['subdomain']
+ '/search' + '/search'
+ "?" + "?"
+ urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"}) + urlencode(
{
'q': query,
'tbm': "vid",
'start': 10 * params['pageno'],
**google_info['params'],
'asearch': 'arc',
'async': 'use_ac:true,_fmt:html',
}
)
) )
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
@ -127,9 +90,8 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url params['url'] = query_url
params['cookies']['CONSENT'] = "YES+" params['cookies'] = google_info['cookies']
params['headers'].update(lang_info['headers']) params['headers'].update(google_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params
@ -141,43 +103,30 @@ def response(resp):
# convert the text to dom # convert the text to dom
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
vidthumb_imgdata = scrap_out_thumbs(dom)
thumbs_src = scrap_out_thumbs_src(dom)
logger.debug(str(thumbs_src))
# parse results # parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
# ignore google *sections* img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
if extract_text(eval_xpath(result, g_section_with_header)): if img_src is None:
logger.debug("ignoring <g-section-with-header>")
continue continue
# ignore articles without an image id / e.g. news articles title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
if img_id is None:
logger.error("no img_id found in item %s (news article?)", len(results) + 1)
continue
img_src = vidthumb_imgdata.get(img_id, None)
if not img_src:
img_src = thumbs_src.get(img_id, "")
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span'))
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
content = extract_text(c_node) content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
results.append( results.append(
{ {
'url': url, 'url': url,
'title': title, 'title': title,
'content': content, 'content': content,
'length': length,
'author': pub_info, 'author': pub_info,
'thumbnail': img_src, 'thumbnail': img_src,
'length': length,
'template': 'videos.html', 'template': 'videos.html',
} }
) )

View file

@ -1,18 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" # lint: pylint
peertube (Videos) """Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` do share
(more or less) the same REST API and the schema of the JSON result is identical.
""" """
from json import loads import re
from datetime import datetime
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import html_to_text from datetime import datetime
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
import babel
from searx import network
from searx.locales import language_tag
from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = { about = {
# pylint: disable=line-too-long
"website": 'https://joinpeertube.org', "website": 'https://joinpeertube.org',
"wikidata_id": 'Q50938515', "wikidata_id": 'Q50938515',
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html', "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True, "use_official_api": True,
"require_api_key": False, "require_api_key": False,
"results": 'JSON', "results": 'JSON',
@ -22,66 +34,155 @@ about = {
categories = ["videos"] categories = ["videos"]
paging = True paging = True
base_url = "https://peer.tube" base_url = "https://peer.tube"
supported_languages_url = 'https://peer.tube/api/v1/videos/languages' """Base URL of the Peertube instance. A list of instances is available at:
- https://instances.joinpeertube.org/instances
"""
time_range_support = True
time_range_table = {
'day': relativedelta(),
'week': relativedelta(weeks=-1),
'month': relativedelta(months=-1),
'year': relativedelta(years=-1),
}
safesearch = True
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
def minute_to_hm(minute):
    """Format the integer *minute* as ``<minute // 60>:<minute % 60>``.

    The remainder is zero padded to two digits (``125`` becomes ``"2:05"``).
    ``None`` is returned when *minute* is not an ``int``.
    """
    if not isinstance(minute, int):
        return None
    quotient, remainder = divmod(minute, 60)
    return "%d:%02d" % (quotient, remainder)
# do search-request
def request(query, params): def request(query, params):
sanitized_url = base_url.rstrip("/") """Assemble request for the Peertube API"""
pageno = (params["pageno"] - 1) * 15
search_url = sanitized_url + "/api/v1/search/videos/?pageno={pageno}&{query}" if not query:
query_dict = {"search": query} return False
language = params["language"].split("-")[0]
if "all" != language and language in supported_languages: # eng_region = traits.get_region(params['searxng_locale'], 'en_US')
query_dict["languageOneOf"] = language eng_lang = traits.get_language(params['searxng_locale'], None)
params["url"] = search_url.format(query=urlencode(query_dict), pageno=pageno)
params['url'] = (
base_url.rstrip("/")
+ "/api/v1/search/videos?"
+ urlencode(
{
'search': query,
'searchTarget': 'search-index', # Vidiversum
'resultType': 'videos',
'start': (params['pageno'] - 1) * 10,
'count': 10,
# -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']],
}
)
)
if eng_lang is not None:
params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat()
return params return params
def _get_offset_from_pageno(pageno):
    """Return the 1-based offset of the first result of page *pageno*, with 15
    results per page."""
    page_size = 15
    return page_size * (pageno - 1) + 1
# get response from search-request
def response(resp): def response(resp):
sanitized_url = base_url.rstrip("/") return video_response(resp)
def video_response(resp):
    """Parse video response from SepiaSearch and Peertube instances.

    :param resp: HTTP response; its ``json()`` payload is expected to carry the
        list of videos under the ``data`` key.
    :return: list of result dicts using the ``videos.html`` template; an empty
        list when the payload has no ``data`` key.
    """
    json_data = resp.json()

    if 'data' not in json_data:
        return []

    results = []
    for result in json_data['data']:
        # ``channel`` / ``account`` / ``tags`` may be missing or null: fall back
        # to empty containers so a sparse item does not raise.
        channel = result.get('channel') or {}
        account = result.get('account') or {}
        tags = result.get('tags') or []

        # Build the ``name@host`` handle only when both parts are present; the
        # unguarded concatenation raised a TypeError when one of them was None.
        handle = None
        if channel.get('name') and channel.get('host'):
            handle = channel['name'] + '@' + channel['host']

        # keep only the non-empty metadata parts
        metadata = [x for x in (channel.get('displayName'), handle, ', '.join(tags)) if x]

        results.append(
            {
                'url': result['url'],
                'title': result['name'],
                'content': html_to_text(result.get('description') or ''),
                'author': account.get('displayName'),
                'length': minute_to_hm(result.get('duration')),
                'template': 'videos.html',
                'publishedDate': parse(result['publishedAt']),
                'iframe_src': result.get('embedUrl'),
                # fall back to the preview image when there is no thumbnail
                'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'),
                'metadata': ' | '.join(metadata),
            }
        )

    return results
def _fetch_supported_languages(resp): def fetch_traits(engine_traits: EngineTraits):
videolanguages = resp.json() """Fetch languages from peertube's search-index source code.
peertube_languages = list(videolanguages.keys())
return peertube_languages See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_
.. _8ed5c729 - Refactor and redesign client:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729
.. _videoLanguages:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291
"""
resp = network.get(
'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue',
# the response from search-index repository is very slow
timeout=60,
)
if not resp.ok:
print("ERROR: response from peertube is not OK.")
return
js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL)
if not js_lang:
print("ERROR: can't determine languages from peertube")
return
for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)):
try:
eng_tag = lang.group(1)
if eng_tag == 'oc':
# Occitan is not known by babel; its closest relative is Catalan,
# but 'ca' is already in the list of engine_traits.languages -->
# 'oc' will be ignored.
continue
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: %s is unknown by babel" % eng_tag)
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag
engine_traits.languages['zh_Hans'] = 'zh'
engine_traits.languages['zh_Hant'] = 'zh'

View file

@ -34,7 +34,9 @@ import babel
from searx.exceptions import SearxEngineAPIException from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror from searx.network import raise_for_httperror
from searx.locales import get_engine_locale from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about # about
about = { about = {
@ -49,7 +51,6 @@ about = {
# engine dependent config # engine dependent config
categories = [] categories = []
paging = True paging = True
supported_languages_url = about['website']
qwant_categ = None # web|news|inages|videos qwant_categ = None # web|news|inages|videos
safesearch = True safesearch = True
@ -95,7 +96,7 @@ def request(query, params):
) )
# add quant's locale # add quant's locale
q_locale = get_engine_locale(params['language'], supported_languages, default='en_US') q_locale = traits.get_region(params["searxng_locale"], default='en_US')
params['url'] += '&locale=' + q_locale params['url'] += '&locale=' + q_locale
# add safesearch option # add safesearch option
@ -243,15 +244,20 @@ def response(resp):
return results return results
def _fetch_supported_languages(resp): def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import region_tag
resp = network.get(about['website'])
text = resp.text text = resp.text
text = text[text.find('INITIAL_PROPS') :] text = text[text.find('INITIAL_PROPS') :]
text = text[text.find('{') : text.find('</script>')] text = text[text.find('{') : text.find('</script>')]
q_initial_props = loads(text) q_initial_props = loads(text)
q_locales = q_initial_props.get('locales') q_locales = q_initial_props.get('locales')
q_valid_locales = [] eng_tag_list = set()
for country, v in q_locales.items(): for country, v in q_locales.items():
for lang in v['langs']: for lang in v['langs']:
@ -261,25 +267,18 @@ def _fetch_supported_languages(resp):
# qwant-news does not support all locales from qwant-web: # qwant-news does not support all locales from qwant-web:
continue continue
q_valid_locales.append(_locale) eng_tag_list.add(_locale)
supported_languages = {} for eng_tag in eng_tag_list:
for q_locale in q_valid_locales:
try: try:
locale = babel.Locale.parse(q_locale, sep='_') sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
except babel.core.UnknownLocaleError: except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % q_locale) print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
continue continue
# note: supported_languages (dict) conflict = engine_traits.regions.get(sxng_tag)
# if conflict:
# dict's key is a string build up from a babel.Locale object / the if conflict != eng_tag:
# notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
# language) notation and dict's values are the locale strings used by continue
# the engine. engine_traits.regions[sxng_tag] = eng_tag
searxng_locale = locale.language + '-' + locale.territory # --> params['language']
supported_languages[searxng_locale] = q_locale
return supported_languages

View file

@ -1,70 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" # lint: pylint
SepiaSearch (Videos) """SepiaSearch uses the same languages as :py:obj:`Peertube
<searx.engines.peertube>` and the response is identical to the response from the
peertube engines.
""" """
from json import loads from typing import TYPE_CHECKING
from dateutil import parser, relativedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
# about from searx.engines.peertube import fetch_traits # pylint: disable=unused-import
from searx.engines.peertube import (
# pylint: disable=unused-import
video_response,
safesearch_table,
time_range_table,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = { about = {
# pylint: disable=line-too-long
"website": 'https://sepiasearch.org', "website": 'https://sepiasearch.org',
"wikidata_id": None, "wikidata_id": None,
"official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api", # NOQA "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True, "use_official_api": True,
"require_api_key": False, "require_api_key": False,
"results": 'JSON', "results": 'JSON',
} }
# engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
base_url = 'https://sepiasearch.org'
time_range_support = True time_range_support = True
safesearch = True safesearch = True
supported_languages = [
# fmt: off
'en', 'fr', 'ja', 'eu', 'ca', 'cs', 'eo', 'el',
'de', 'it', 'nl', 'es', 'oc', 'gd', 'zh', 'pt',
'sv', 'pl', 'fi', 'ru'
# fmt: on
]
base_url = 'https://sepiasearch.org/api/v1/search/videos'
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
time_range_table = {
'day': relativedelta.relativedelta(),
'week': relativedelta.relativedelta(weeks=-1),
'month': relativedelta.relativedelta(months=-1),
'year': relativedelta.relativedelta(years=-1),
}
def minute_to_hm(minute):
    """Format an integer as ``"<quotient>:<remainder>"`` with base 60.

    The value is split with ``divmod(minute, 60)`` and the remainder is
    zero-padded to two digits, e.g. ``125`` becomes ``"2:05"``.

    :param minute: value to format.
    :return: the formatted string, or ``None`` when *minute* is not a
        non-negative, non-boolean ``int``.
    """
    # ``bool`` is a subclass of ``int``: without this guard ``True`` would be
    # rendered as "0:01".  Negative values would yield nonsense like "-1:59".
    if isinstance(minute, bool) or not isinstance(minute, int) or minute < 0:
        return None
    return "%d:%02d" % divmod(minute, 60)
def request(query, params): def request(query, params):
"""Assemble request for the SepiaSearch API"""
if not query:
return False
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], None)
params['url'] = ( params['url'] = (
base_url base_url.rstrip("/")
+ '?' + "/api/v1/search/videos?"
+ urlencode( + urlencode(
{ {
'search': query, 'search': query,
'start': (params['pageno'] - 1) * 10, 'start': (params['pageno'] - 1) * 10,
'count': 10, 'count': 10,
'sort': '-match', # -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']], 'nsfw': safesearch_table[params['safesearch']],
} }
) )
) )
language = params['language'].split('-')[0] if eng_lang is not None:
if language in supported_languages: params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&languageOneOf[]=' + language params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table: if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']] time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat() params['url'] += '&startDate=' + time.isoformat()
@ -73,34 +83,4 @@ def request(query, params):
def response(resp): def response(resp):
results = [] return video_response(resp)
search_results = loads(resp.text)
if 'data' not in search_results:
return []
for result in search_results['data']:
title = result['name']
content = result['description']
thumbnail = result['thumbnailUrl']
publishedDate = parser.parse(result['publishedAt'])
author = result.get('account', {}).get('displayName')
length = minute_to_hm(result.get('duration'))
url = result['url']
results.append(
{
'url': url,
'title': title,
'content': content,
'author': author,
'length': length,
'template': 'videos.html',
'publishedDate': publishedDate,
'iframe_src': result.get('embedUrl'),
'thumbnail': thumbnail,
}
)
return results

View file

@ -1,28 +1,108 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Startpage (Web) """Startpage's language & region selectors are a mess ..
.. _startpage regions:
Startpage regions
=================
In the list of regions there are tags we need to map to common region tags::
pt-BR_BR --> pt_BR
zh-CN_CN --> zh_Hans_CN
zh-TW_TW --> zh_Hant_TW
zh-TW_HK --> zh_Hant_HK
en-GB_GB --> en_GB
and there is at least one tag with a three letter language tag (ISO 639-2)::
fil_PH --> fil_PH
The locale code ``no_NO`` from Startpage does not exist and is mapped to
``nb-NO``::
babel.core.UnknownLocaleError: unknown locale 'no_NO'
For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
W3C recommends subtag over macrolanguage [2]_.
.. [1] `iana: language-subtag-registry
<https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::
type: language
Subtag: nb
Description: Norwegian Bokmål
Added: 2005-10-16
Suppress-Script: Latn
Macrolanguage: no
.. [2]
Use macrolanguages with care. Some language subtags have a Scope field set to
macrolanguage, i.e. this primary language subtag encompasses a number of more
specific primary language subtags in the registry. ... As we recommended for
the collection subtags mentioned above, in most cases you should try to use
the more specific subtags ... `W3: The primary language subtag
<https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
.. _startpage languages:
Startpage languages
===================
:py:obj:`send_accept_language_header`:
The displayed names in Startpage's settings page depend on the location of the
IP when the ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
we use::
'Accept-Language': "en-US,en;q=0.5",
..
to get uniform names independent of the IP.
.. _startpage categories:
Startpage categories
====================
Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::
- name: startpage
engine: startpage
startpage_categ: web
...
.. hint::
The default category is ``web``; categories other than ``web`` are not
yet implemented.
""" """
from typing import TYPE_CHECKING
from collections import OrderedDict
import re import re
from time import time
from urllib.parse import urlencode
from unicodedata import normalize, combining from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dateutil import parser import dateutil.parser
from lxml import html import lxml.html
from babel import Locale import babel
from babel.localedata import locale_identifiers
from searx.network import get from searx import network
from searx.utils import extract_text, eval_xpath, match_language from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import ( from searx.exceptions import SearxEngineCaptchaException
SearxEngineResponseException, from searx.locales import region_tag
SearxEngineCaptchaException, from searx.enginelib.traits import EngineTraits
)
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -34,18 +114,28 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""
send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language`` header. Optionally, the user can select a search language
(which can differ from the UI language) and a region filter.
"""
# engine dependent config # engine dependent config
categories = ['general', 'web'] categories = ['general', 'web']
# there is a mechanism to block "bot" searches
# (probably the parameter qid); it requires
# storing the qids between multiple search calls
paging = True paging = True
supported_languages_url = 'https://www.startpage.com/do/settings' time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}
# search-url # search-url
base_url = 'https://startpage.com/' base_url = 'https://www.startpage.com'
search_url = base_url + 'sp/search?' search_url = base_url + '/sp/search'
# specific xpath variables # specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@ -53,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]' results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]' link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]' content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form
.. code:: html
<form action="/sp/search" method="post">
<input type="text" name="query" value="" ..>
<input type="hidden" name="t" value="device">
<input type="hidden" name="lui" value="english">
<input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
<input type="hidden" name="cat" value="web">
<input type="hidden" class="abp" id="abp-input" name="abp" value="1">
</form>
"""
# timestamp of the last fetch of 'sc' code # timestamp of the last fetch of 'sc' code
sc_code_ts = 0 sc_code_ts = 0
sc_code = '' sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
def raise_captcha(resp): def get_sc_code(searxng_locale, params):
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
raise SearxEngineCaptchaException() <search_form_xpath>`. Without this argument Startpage considers the request
is from a bot. We do not know what is encoded in the value of the ``sc``
argument, but it seems to be a kind of a *time-stamp*.
Startpage's search form generates a new sc-code on each request. This
def get_sc_code(headers): function scrap a new sc-code from Startpage's home page every
"""Get an actual `sc` argument from startpage's home page. :py:obj:`sc_code_cache_sec` seconds.
Startpage puts a `sc` argument on every link. Without this argument
startpage considers the request is from a bot. We do not know what is
encoded in the value of the `sc` argument, but it seems to be a kind of a
*time-stamp*. This *time-stamp* is valid for a few hours.
This function scrap a new *time-stamp* from startpage's home page every hour
(3000 sec).
""" """
global sc_code_ts, sc_code # pylint: disable=global-statement global sc_code_ts, sc_code # pylint: disable=global-statement
if time() > (sc_code_ts + 3000): if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
logger.debug("query new sc time-stamp ...") logger.debug("get_sc_code: reuse '%s'", sc_code)
return sc_code
resp = get(base_url, headers=headers) headers = {**params['headers']}
raise_captcha(resp) headers['Origin'] = base_url
dom = html.fromstring(resp.text) headers['Referer'] = base_url + '/'
# headers['Connection'] = 'keep-alive'
# headers['Accept-Encoding'] = 'gzip, deflate, br'
# headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
# headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
try: # add Accept-Language header
# <input type="hidden" name="sc" value="..."> if searxng_locale == 'all':
sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0] searxng_locale = 'en-US'
except IndexError as exc: locale = babel.Locale.parse(searxng_locale, sep='-')
# suspend startpage API --> https://github.com/searxng/searxng/pull/695
raise SearxEngineResponseException(
suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
) from exc
sc_code_ts = time() if send_accept_language_header:
logger.debug("new value is: %s", sc_code) ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
locale.language,
locale.territory,
locale.language,
)
headers['Accept-Language'] = ac_lang
get_sc_url = base_url + '/?sc=%s' % (sc_code)
logger.debug("query new sc time-stamp ... %s", get_sc_url)
logger.debug("headers: %s", headers)
resp = network.get(get_sc_url, headers=headers)
# ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
)
dom = lxml.html.fromstring(resp.text)
try:
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
except IndexError as exc:
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineCaptchaException(
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
) from exc
sc_code_ts = time()
logger.debug("get_sc_code: new value is: %s", sc_code)
return sc_code return sc_code
# do search-request
def request(query, params):
    """Assemble a Startpage request.

    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
    cookie.  We need to form a request that is identical to the request built by
    Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally, the arguments from Startpage's search form need to be set in
    the HTTP POST data / compare ``<input>`` elements:
    :py:obj:`search_form_xpath`.

    :param query: the search term.
    :param params: request parameters; returned unchanged when the configured
        :py:obj:`startpage_categ` is not implemented.
    :return: the request parameters.
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    # '%s' is required here: a bare '%' followed by a quote is not a valid
    # conversion and makes the logging call fail while formatting the message.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return params
def _request_cat_web(query, params):
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# build arguments
args = { args = {
'query': query, 'query': query,
'page': params['pageno'],
'cat': 'web', 'cat': 'web',
# 'pl': 'ext-ff', 't': 'device',
# 'extVersion': '1.3.0', 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
# 'abp': "-1", 'with_date': time_range_dict.get(params['time_range'], ''),
'sc': get_sc_code(params['headers']),
} }
# set language if specified if engine_language:
if params['language'] != 'all': args['language'] = engine_language
lang_code = match_language(params['language'], supported_languages, fallback=None) args['lui'] = engine_language
if lang_code:
language_name = supported_languages[lang_code]['alias'] args['abp'] = '1'
args['language'] = language_name if params['pageno'] > 1:
args['lui'] = language_name args['page'] = params['pageno']
# build cookie
lang_homepage = 'en'
cookie = OrderedDict()
cookie['date_time'] = 'world'
cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
cookie['disable_open_in_new_window'] = '0'
cookie['enable_post_method'] = '1' # hint: POST
cookie['enable_proxy_safety_suggest'] = '1'
cookie['enable_stay_control'] = '1'
cookie['instant_answers'] = '1'
cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
cookie['num_of_results'] = '10'
cookie['suggestions'] = '1'
cookie['wt_unit'] = 'celsius'
if engine_language:
cookie['language'] = engine_language
cookie['language_ui'] = engine_language
if engine_region:
cookie['search_results_region'] = engine_region
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences'])
# POST request
logger.debug("data: %s", args)
params['data'] = args
params['method'] = 'POST'
params['url'] = search_url
params['headers']['Origin'] = base_url
params['headers']['Referer'] = base_url + '/'
# is the Accept header needed?
# params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['url'] = search_url + urlencode(args)
return params return params
# get response from search-request # get response from search-request
def response(resp):
    """Parse Startpage's HTML response.

    The response text is parsed into an lxml DOM and handed to the parser of
    the configured :py:obj:`startpage_categ`.

    :param resp: HTTP response of the search request.
    :return: the parsed results; an empty list (after logging an error) when
        the configured category is not ``web``.
    """
    dom = lxml.html.fromstring(resp.text)

    if startpage_categ == 'web':
        return _response_cat_web(dom)

    # '%s' is required here: a bare '%' followed by a quote is not a valid
    # conversion and makes the logging call fail while formatting the message.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return []
def _response_cat_web(dom):
results = []
# parse results # parse results
for result in eval_xpath(dom, results_xpath): for result in eval_xpath(dom, results_xpath):
@ -173,7 +364,7 @@ def response(resp):
content = content[date_pos:] content = content[date_pos:]
try: try:
published_date = parser.parse(date_string, dayfirst=True) published_date = dateutil.parser.parse(date_string, dayfirst=True)
except ValueError: except ValueError:
pass pass
@ -199,62 +390,103 @@ def response(resp):
return results return results
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage.

    The ``<option>`` lists of Startpage's settings form are scraped and mapped
    to SearXNG tags; the mappings are stored in ``engine_traits.regions`` and
    ``engine_traits.languages``.  Problems are reported via ``print`` and the
    affected tag is skipped.

    :param engine_traits: traits object that is filled in place; nothing is
        returned.
    """
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        # request English names, independent from the location of the IP
        'Accept-Language': "en-US,en;q=0.5",
    }
    resp = network.get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:
        print("ERROR: response from Startpage is not OK.")
        # do not try to scrape an error page
        return

    dom = lxml.html.fromstring(resp.text)

    # regions

    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # norway

        if '-' in babel_region_tag:
            # e.g. 'pt-BR_BR': keep the language left of '-' and the territory
            # right of the last '_'
            lang_part, territory_part = babel_region_tag.split('-')
            babel_region_tag = lang_part + '_' + territory_part.split('_')[-1]

        # both tag forms are parsed inside the try, so an unknown locale is
        # reported and skipped instead of aborting the whole update
        try:
            sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))
        except babel.UnknownLocaleError:
            print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
            continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    # English name of every language known by babel --> language code
    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name().lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()

        # look up by option value first, then by the displayed name
        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code.get(name)
        if sxng_tag is None:
            # report and skip instead of raising a KeyError for one new option
            print("ERROR: unknown language option in Startpage: %s (%s)" % (eng_tag, name))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag

View file

@ -1,9 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Wikidata """This module implements the Wikidata engine. Some implementations are shared
from :ref:`wikipedia engine`.
""" """
# pylint: disable=missing-class-docstring # pylint: disable=missing-class-docstring
from typing import TYPE_CHECKING
from hashlib import md5 from hashlib import md5
from urllib.parse import urlencode, unquote from urllib.parse import urlencode, unquote
from json import loads from json import loads
@ -13,12 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
from searx.data import WIKIDATA_UNITS from searx.data import WIKIDATA_UNITS
from searx.network import post, get from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import ( # pylint: disable=unused-import from searx.engines.wikipedia import fetch_traits as _fetch_traits
_fetch_supported_languages, from searx.enginelib.traits import EngineTraits
supported_languages_url,
) if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
@ -154,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
def request(query, params): def request(query, params):
language = params['language'].split('-')[0]
if language == 'all': # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
language = 'en' # mapped to zh
else: sxng_lang = params['searxng_locale'].split('-')[0]
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] language = traits.get_language(sxng_lang, 'en')
query, attributes = get_query(query, language) query, attributes = get_query(query, language)
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
params['method'] = 'POST' params['method'] = 'POST'
params['url'] = SPARQL_ENDPOINT_URL params['url'] = SPARQL_ENDPOINT_URL
params['data'] = {'query': query} params['data'] = {'query': query}
params['headers'] = get_headers() params['headers'] = get_headers()
params['language'] = language params['language'] = language
params['attributes'] = attributes params['attributes'] = attributes
return params return params
def response(resp): def response(resp):
results = [] results = []
jsonresponse = loads(resp.content.decode()) jsonresponse = loads(resp.content.decode())
language = resp.search_params['language'].lower() language = resp.search_params['language']
attributes = resp.search_params['attributes'] attributes = resp.search_params['attributes']
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
seen_entities = set() seen_entities = set()
for result in jsonresponse.get('results', {}).get('bindings', []): for result in jsonresponse.get('results', {}).get('bindings', []):
attribute_result = {key: value['value'] for key, value in result.items()} attribute_result = {key: value['value'] for key, value in result.items()}
entity_url = attribute_result['item'] entity_url = attribute_result['item']
@ -756,3 +766,15 @@ def init(engine_settings=None): # pylint: disable=unused-argument
lang = result['name']['xml:lang'] lang = result['name']['xml:lang']
entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
def fetch_traits(engine_traits: EngineTraits):
    """Populate ``engine_traits`` for wikidata on top of Wikipedia's traits.

    The languages are taken over from :py:obj:`wikipedia.fetch_traits
    <searx.engines.wikipedia.fetch_traits>`.  Afterwards the ``zh_Hans`` entry
    (zh-classical) is dropped from the languages and the ``wiki_netloc``
    custom field is reset to an empty mapping.
    """

    _fetch_traits(engine_traits)

    # wikidata does not support zh-classical (zh_Hans)
    del engine_traits.languages['zh_Hans']

    # wikidata does not have net-locations for the languages
    engine_traits.custom['wiki_netloc'] = {}

View file

@ -1,13 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" # lint: pylint
Wikipedia (Web) """This module implements the Wikipedia engine. Some of these implementations
are shared by other engines:
- :ref:`wikidata engine`
The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`. Unlike traditional search engines, wikipedia
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`).
""" """
from urllib.parse import quote import urllib.parse
from json import loads import babel
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent from lxml import html
from searx.network import raise_for_httperror
from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about # about
about = { about = {
@ -19,32 +32,40 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True send_accept_language_header = True
# search-url wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' """The *editing depth* of Wikipedia is one of several possible rough indicators
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' of the encyclopedia's collaborative quality, showing how frequently its articles
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")} are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.
.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
"""
# set language in base_url
def url_lang(lang):
lang_pre = lang.split('-')[0]
if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
return 'en'
return match_language(lang, supported_languages, language_aliases).split('-')[0]
# do search-request
def request(query, params): def request(query, params):
"""Assemble a request (`wikipedia rest_v1 summary API`_)."""
if query.islower(): if query.islower():
query = query.title() query = query.title()
language = url_lang(params['language']) engine_language = traits.get_language(params['searxng_locale'], 'en')
params['url'] = search_url.format(title=quote(query), language=language) wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
title = urllib.parse.quote(query)
# '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
# '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
params['headers']['User-Agent'] = searx_useragent()
params['raise_for_httperror'] = False params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2 params['soft_max_redirects'] = 2
@ -53,13 +74,14 @@ def request(query, params):
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = []
if resp.status_code == 404: if resp.status_code == 404:
return [] return []
if resp.status_code == 400: if resp.status_code == 400:
try: try:
api_result = loads(resp.text) api_result = resp.json()
except: except Exception: # pylint: disable=broad-except
pass pass
else: else:
if ( if (
@ -68,49 +90,135 @@ def response(resp):
): ):
return [] return []
raise_for_httperror(resp) network.raise_for_httperror(resp)
results = []
api_result = loads(resp.text)
# skip disambiguation pages
if api_result.get('type') != 'standard':
return []
api_result = resp.json()
title = api_result['title'] title = api_result['title']
wikipedia_link = api_result['content_urls']['desktop']['page'] wikipedia_link = api_result['content_urls']['desktop']['page']
results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
results.append({'url': wikipedia_link, 'title': title}) if api_result.get('type') == 'standard':
results.append(
results.append( {
{ 'infobox': title,
'infobox': title, 'id': wikipedia_link,
'id': wikipedia_link, 'content': api_result.get('extract', ''),
'content': api_result.get('extract', ''), 'img_src': api_result.get('thumbnail', {}).get('source'),
'img_src': api_result.get('thumbnail', {}).get('source'), 'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}], }
} )
)
return results return results
# get supported languages from their site # Nonstandard language codes
def _fetch_supported_languages(resp): #
supported_languages = {} # These Wikipedias use language codes that do not conform to the ISO 639
dom = fromstring(resp.text) # standard (which is how wiki subdomains are chosen nowadays).
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
trs = table.xpath('.//tr')[1:]
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[1].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
# exclude languages with too few articles
if articles >= 100:
supported_languages[code] = {"name": name, "english_name": english_name}
return supported_languages lang_map = {
'be-tarask': 'bel',
'ak': 'aka',
'als': 'gsw',
'bat-smg': 'sgs',
'cbk-zam': 'cbk',
'fiu-vro': 'vro',
'map-bms': 'map',
'nrm': 'nrf',
'roa-rup': 'rup',
'nds-nl': 'nds',
#'simple: invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
'zh-min-nan': 'nan',
'zh-yue': 'yue',
'an': 'arg',
'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason)
}
unknown_langs = [
'an', # Aragonese
'ba', # Bashkir
'bar', # Bavarian
'bcl', # Central Bicolano
'be-tarask', # Belarusian variant / Belarusian is already covered by 'be'
'bpy', # Bishnupriya Manipuri is unknown by babel
'hif', # Fiji Hindi
'ilo', # Ilokano
'li', # Limburgish
'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
'sh', # Serbo-Croatian
'simple', # simple english is not know as a natural language different to english (babel)
'vo', # Volapük
'wa', # Walloon
]
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    The table rows of the page :py:obj:`wikipedia_article_depth` are
    evaluated.  A row is skipped when it has fewer than 10000 articles, a
    depth below 20, a tag listed in :py:obj:`unknown_langs` or a tag that is
    unknown to babel.  If the page can't be fetched, an error is printed and
    only the initial values set at the top of this function remain.

    :param engine_traits: traits object whose ``languages`` and
        ``custom['wiki_netloc']`` are filled in place.
    """

    engine_traits.custom['wiki_netloc'] = {}

    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        # don't try to parse the body of an error response
        print("ERROR: response from Wikipedia is not OK.")
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            # header rows have no <td> cells
            continue
        cols = [c.text_content().strip() for c in cols]

        # a '-' in the depth column stands for "no value"
        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        if int(depth) < 20:
            # Rough indicator of a Wikipedias quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[2]
        if eng_tag in unknown_langs:
            # filter first, the link of a skipped row is never needed
            continue

        wiki_url = urllib.parse.urlparse(row.xpath('./td[3]/a/@href')[0])

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            # first come, first served: keep the existing mapping and report
            # only real conflicts
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

View file

@ -17,8 +17,10 @@ from searx.utils import (
eval_xpath_getindex, eval_xpath_getindex,
eval_xpath_list, eval_xpath_list,
extract_text, extract_text,
match_language,
) )
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about # about
about = { about = {
@ -34,8 +36,7 @@ about = {
categories = ['general', 'web'] categories = ['general', 'web']
paging = True paging = True
time_range_support = True time_range_support = True
supported_languages_url = 'https://search.yahoo.com/preferences/languages' # send_accept_language_header = True
"""Supported languages are read from Yahoo preference page."""
time_range_dict = { time_range_dict = {
'day': ('1d', 'd'), 'day': ('1d', 'd'),
@ -43,15 +44,10 @@ time_range_dict = {
'month': ('1m', 'm'), 'month': ('1m', 'm'),
} }
language_aliases = {
'zh-HK': 'zh_chs',
'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com
'zh-TW': 'zh_cht',
}
lang2domain = { lang2domain = {
'zh_chs': 'hk.search.yahoo.com', 'zh_chs': 'hk.search.yahoo.com',
'zh_cht': 'tw.search.yahoo.com', 'zh_cht': 'tw.search.yahoo.com',
'any': 'search.yahoo.com',
'en': 'search.yahoo.com', 'en': 'search.yahoo.com',
'bg': 'search.yahoo.com', 'bg': 'search.yahoo.com',
'cs': 'search.yahoo.com', 'cs': 'search.yahoo.com',
@ -67,21 +63,23 @@ lang2domain = {
} }
"""Map language to domain""" """Map language to domain"""
locale_aliases = {
def _get_language(params): 'zh': 'zh_Hans',
'zh-HK': 'zh_Hans',
lang = language_aliases.get(params['language']) 'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com
if lang is None: 'zh-TW': 'zh_Hant',
lang = match_language(params['language'], supported_languages, language_aliases) }
lang = lang.split('-')[0]
logger.debug("params['language']: %s --> %s", params['language'], lang)
return lang
def request(query, params): def request(query, params):
"""build request""" """build request"""
lang = locale_aliases.get(params['language'], None)
if not lang:
lang = params['language'].split('-')[0]
lang = traits.get_language(lang, traits.all_locale)
offset = (params['pageno'] - 1) * 7 + 1 offset = (params['pageno'] - 1) * 7 + 1
lang = _get_language(params)
age, btf = time_range_dict.get(params['time_range'], ('', '')) age, btf = time_range_dict.get(params['time_range'], ('', ''))
args = urlencode( args = urlencode(
@ -154,13 +152,37 @@ def response(resp):
return results return results
# get supported languages from their site def fetch_traits(engine_traits: EngineTraits):
def _fetch_supported_languages(resp): """Fetch languages from yahoo"""
supported_languages = []
# pylint: disable=import-outside-toplevel
import babel
from searx import network
from searx.locales import language_tag
engine_traits.all_locale = 'any'
resp = network.get('https://search.yahoo.com/preferences/languages')
if not resp.ok:
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
offset = len('lang_') offset = len('lang_')
for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'): eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'}
supported_languages.append(val[offset:])
return supported_languages for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
eng_tag = val[offset:]
try:
sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag)))
except babel.UnknownLocaleError:
print('ERROR: unknown language --> %s' % eng_tag)
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag

View file

@ -4,11 +4,11 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`. """Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
""" """
from typing import Set from typing import Set, Optional, List
import os import os
import pathlib import pathlib
from babel import Locale import babel
from babel.support import Translations from babel.support import Translations
import babel.languages import babel.languages
import babel.core import babel.core
@ -134,7 +134,7 @@ def locales_initialize(directory=None):
flask_babel.get_translations = get_translations flask_babel.get_translations = get_translations
for tag, descr in ADDITIONAL_TRANSLATIONS.items(): for tag, descr in ADDITIONAL_TRANSLATIONS.items():
locale = Locale.parse(LOCALE_BEST_MATCH[tag], sep='-') locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
LOCALE_NAMES[tag] = descr LOCALE_NAMES[tag] = descr
if locale.text_direction == 'rtl': if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag) RTL_LOCALES.add(tag)
@ -142,7 +142,7 @@ def locales_initialize(directory=None):
for tag in LOCALE_BEST_MATCH: for tag in LOCALE_BEST_MATCH:
descr = LOCALE_NAMES.get(tag) descr = LOCALE_NAMES.get(tag)
if not descr: if not descr:
locale = Locale.parse(tag, sep='-') locale = babel.Locale.parse(tag, sep='-')
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_')) LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
if locale.text_direction == 'rtl': if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag) RTL_LOCALES.add(tag)
@ -154,12 +154,77 @@ def locales_initialize(directory=None):
tag = dirname.replace('_', '-') tag = dirname.replace('_', '-')
descr = LOCALE_NAMES.get(tag) descr = LOCALE_NAMES.get(tag)
if not descr: if not descr:
locale = Locale.parse(dirname) locale = babel.Locale.parse(dirname)
LOCALE_NAMES[tag] = get_locale_descr(locale, dirname) LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
if locale.text_direction == 'rtl': if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag) RTL_LOCALES.add(tag)
def region_tag(locale: babel.Locale) -> str:
    """Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US).

    :param locale: locale to build the tag from, it needs a territory.
    :raises ValueError: if ``locale`` has no territory.
    """
    if not locale.territory:
        # name the offending locale in the message, the placeholder was never
        # filled before
        raise ValueError('%s missed a territory' % (locale,))
    return locale.language + '-' + locale.territory
def language_tag(locale: babel.Locale) -> str:
    """Returns SearXNG's language tag from the locale.  If the locale has a
    script, the script name is appended to the tag (e.g. en, zh_Hant).
    """
    if locale.script:
        return locale.language + '_' + locale.script
    return locale.language
def get_locale(locale_tag: str) -> Optional[babel.Locale]:
    """Parse ``locale_tag`` (``-`` separated) into a :py:obj:`babel.Locale`
    object.  ``None`` is returned if babel does not know the locale."""
    try:
        parsed = babel.Locale.parse(locale_tag, sep='-')
    except babel.core.UnknownLocaleError:
        return None
    return parsed
def get_offical_locales(
    territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> Set[babel.Locale]:
    """Returns a set of :py:obj:`babel.Locale` build from the languages of
    :py:obj:`babel.languages.get_official_languages`.

    :param territory: The territory (country or region) code.

    :param languages: A list of language codes the languages from
      :py:obj:`babel.languages.get_official_languages` should be in
      (intersection, compared case-insensitive).  If this argument is
      ``None``, all official languages in this territory are used.

    :param regional: If the regional flag is set, then languages which are
      regionally official are also returned.

    :param de_facto: If the de_facto flag is set to `False`, then languages
      which are de facto official are not returned.

    Combinations of language and territory babel does not know are left out.
    """
    official = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)

    if languages:
        wanted = [code.lower() for code in languages]
        official = {code for code in official if code.lower() in wanted}

    locales = set()
    for code in official:
        try:
            locales.add(babel.Locale.parse(code + '_' + territory))
        except babel.UnknownLocaleError:
            pass
    return locales
def get_engine_locale(searxng_locale, engine_locales, default=None): def get_engine_locale(searxng_locale, engine_locales, default=None):
"""Return engine's language (aka locale) string that best fits to argument """Return engine's language (aka locale) string that best fits to argument
``searxng_locale``. ``searxng_locale``.
@ -177,6 +242,10 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
... ...
'pl-PL' : 'pl_PL', 'pl-PL' : 'pl_PL',
'pt-PT' : 'pt_PT' 'pt-PT' : 'pt_PT'
..
'zh' : 'zh'
'zh_Hans' : 'zh'
'zh_Hant' : 'zh-classical'
} }
.. hint:: .. hint::
@ -210,13 +279,13 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine. engine.
""" """
# pylint: disable=too-many-branches # pylint: disable=too-many-branches, too-many-return-statements
engine_locale = engine_locales.get(searxng_locale) engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None: if engine_locale is not None:
# There was a 1:1 mapping (e.g. "fr-BE --> fr_BE" or "fr --> fr_FR"), no # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
# need to narrow language nor territory. # "zh --> zh"), no need to narrow language-script nor territory.
return engine_locale return engine_locale
try: try:
@ -227,6 +296,12 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
except babel.core.UnknownLocaleError: except babel.core.UnknownLocaleError:
return default return default
searxng_lang = language_tag(locale)
engine_locale = engine_locales.get(searxng_lang)
if engine_locale is not None:
# There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
return engine_locale
# SearXNG's selected locale is not supported by the engine .. # SearXNG's selected locale is not supported by the engine ..
if locale.territory: if locale.territory:
@ -247,10 +322,6 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
if locale.language: if locale.language:
searxng_lang = locale.language
if locale.script:
searxng_lang += '_' + locale.script
terr_lang_dict = {} terr_lang_dict = {}
for territory, langs in babel.core.get_global("territory_languages").items(): for territory, langs in babel.core.get_global("territory_languages").items():
if not langs.get(searxng_lang, {}).get('official_status'): if not langs.get(searxng_lang, {}).get('official_status'):
@ -303,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine_locale = default engine_locale = default
return default return default
def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
    """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.

    :param str searxng_locale: SearXNG's internal representation of locale (de,
        de-DE, fr-BE, zh, zh-CN, zh-TW ..).

    :param list locale_tag_list: The list of locale tags to select from

    :param str fallback: fallback locale tag (if unset --> ``None``)

    The rules to find a match are implemented in :py:obj:`get_engine_locale`,
    the ``engine_locales`` is build up by :py:obj:`build_engine_locales`.

    .. hint::

       The *SearXNG locale* string and the members of ``locale_tag_list`` has to
       be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
       UI and are not known by babel --> will be ignored.
    """

    # searxng_locale = 'es'
    # locale_tag_list = ['es-AR', 'es-ES', 'es-MX']

    if not searxng_locale:
        return fallback

    parsed = get_locale(searxng_locale)
    if parsed is None:
        return fallback

    # normalize to a SearXNG locale that can be passed to get_engine_locale:
    # a region tag if there is a territory, otherwise a language tag
    normalized = region_tag(parsed) if parsed.territory else language_tag(parsed)

    # clean up locale_tag_list: drop the pseudo tags and the UI-only
    # translations
    usable_tags = [
        tag for tag in locale_tag_list if tag not in ('all', 'auto') and tag not in ADDITIONAL_TRANSLATIONS
    ]

    # emulate fetch_traits
    return get_engine_locale(normalized, build_engine_locales(usable_tags), default=fallback)
def build_engine_locales(tag_list: List[str]):
    """From a list of locale tags a dictionary is build that can be passed by
    argument ``engine_locales`` to :py:obj:`get_engine_locale`.  This function
    is mainly used by :py:obj:`match_locale` and is similar to what the
    ``fetch_traits(..)`` function of engines do.

    If there are territory codes in the ``tag_list`` that have a *script code*
    additional keys are added to the returned dictionary.

    .. code:: python

       >>> import locales
       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
       >>> engine_locales
       {
           'en': 'en', 'en-US': 'en-US',
           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
       }
       >>> get_engine_locale('zh-Hans', engine_locales)
       'zh-CN'

    This function is a good example to understand the language/region model
    of SearXNG:

      SearXNG only distinguishes between **search languages** and **search
      regions**, by adding the *script-tags*, languages with *script-tags* can
      be assigned to the **regions** that SearXNG supports.

    :param tag_list: locale tags (``-`` separated); tags babel does not know
        are logged and skipped.
    :return: dictionary mapping SearXNG region / language tags to the tag
        from ``tag_list`` they were derived from.
    """
    engine_locales = {}

    for tag in tag_list:
        locale = get_locale(tag)
        if locale is None:
            # Logger.warn() is a deprecated alias of Logger.warning()
            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
            continue
        if locale.territory:
            engine_locales[region_tag(locale)] = tag
            if locale.script:
                # additionally make the region reachable by its language-script tag
                engine_locales[language_tag(locale)] = tag
        else:
            engine_locales[language_tag(locale)] = tag
    return engine_locales

View file

@ -13,7 +13,7 @@ from typing import Iterable, Dict, List
import flask import flask
from searx import settings, autocomplete from searx import settings, autocomplete
from searx.engines import Engine from searx.enginelib import Engine
from searx.plugins import Plugin from searx.plugins import Plugin
from searx.locales import LOCALE_NAMES from searx.locales import LOCALE_NAMES
from searx.webutils import VALID_LANGUAGE_CODE from searx.webutils import VALID_LANGUAGE_CODE

View file

@ -4,7 +4,7 @@ from abc import abstractmethod, ABC
import re import re
from searx import settings from searx import settings
from searx.languages import language_codes from searx.sxng_locales import sxng_locales
from searx.engines import categories, engines, engine_shortcuts from searx.engines import categories, engines, engine_shortcuts
from searx.external_bang import get_bang_definition_and_autocomplete from searx.external_bang import get_bang_definition_and_autocomplete
from searx.search import EngineRef from searx.search import EngineRef
@ -84,7 +84,7 @@ class LanguageParser(QueryPartParser):
found = False found = False
# check if any language-code is equal with # check if any language-code is equal with
# declared language-codes # declared language-codes
for lc in language_codes: for lc in sxng_locales:
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)
# if correct language-code is found # if correct language-code is found
@ -125,7 +125,7 @@ class LanguageParser(QueryPartParser):
self.raw_text_query.autocomplete_list.append(lang) self.raw_text_query.autocomplete_list.append(lang)
return return
for lc in language_codes: for lc in sxng_locales:
if lc[0] not in settings['search']['languages']: if lc[0] not in settings['search']['languages']:
continue continue
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc) lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)

View file

@ -30,7 +30,10 @@ from .abstract import EngineProcessor
logger = logger.getChild('search.processors') logger = logger.getChild('search.processors')
PROCESSORS: Dict[str, EngineProcessor] = {} PROCESSORS: Dict[str, EngineProcessor] = {}
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)""" """Cache request processores, stored by *engine-name* (:py:func:`initialize`)
:meta hide-value:
"""
def get_processor_class(engine_type): def get_processor_class(engine_type):

View file

@ -138,7 +138,8 @@ class EngineProcessor(ABC):
return False return False
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if request is not supported. """Returns a set of (see :ref:`request params <engine request arguments>`) or
``None`` if request is not supported.
Not supported conditions (``None`` is returned): Not supported conditions (``None`` is returned):
@ -159,11 +160,20 @@ class EngineProcessor(ABC):
params['safesearch'] = search_query.safesearch params['safesearch'] = search_query.safesearch
params['time_range'] = search_query.time_range params['time_range'] = search_query.time_range
params['engine_data'] = search_query.engine_data.get(self.engine_name, {}) params['engine_data'] = search_query.engine_data.get(self.engine_name, {})
params['searxng_locale'] = search_query.lang
# deprecated / vintage --> use params['searxng_locale']
#
# Conditions related to engine's traits are implemented in engine.traits
# module. Don't do 'locale' decisions here in the abstract layer of the
# search processor, just pass the value from user's choice unchanged to
# the engine request.
if hasattr(self.engine, 'language') and self.engine.language: if hasattr(self.engine, 'language') and self.engine.language:
params['language'] = self.engine.language params['language'] = self.engine.language
else: else:
params['language'] = search_query.lang params['language'] = search_query.lang
return params return params
@abstractmethod @abstractmethod

View file

@ -51,6 +51,9 @@ class OnlineProcessor(EngineProcessor):
super().initialize() super().initialize()
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online>` or ``None``
if request is not supported.
"""
params = super().get_params(search_query, engine_category) params = super().get_params(search_query, engine_category)
if params is None: if params is None:
return None return None
@ -184,11 +187,6 @@ class OnlineProcessor(EngineProcessor):
self.handle_exception(result_container, e, suspend=True) self.handle_exception(result_container, e, suspend=True)
self.logger.exception('CAPTCHA') self.logger.exception('CAPTCHA')
except SearxEngineTooManyRequestsException as e: except SearxEngineTooManyRequestsException as e:
if "google" in self.engine_name:
self.logger.warn(
"Set to 'true' the use_mobile_ui parameter in the 'engines:'"
" section of your settings.yml file if google is blocked for you."
)
self.handle_exception(result_container, e, suspend=True) self.handle_exception(result_container, e, suspend=True)
self.logger.exception('Too many requests') self.logger.exception('Too many requests')
except SearxEngineAccessDeniedException as e: except SearxEngineAccessDeniedException as e:
@ -223,7 +221,7 @@ class OnlineProcessor(EngineProcessor):
'test': ['unique_results'], 'test': ['unique_results'],
} }
if getattr(self.engine, 'supported_languages', []): if getattr(self.engine, 'traits', False):
tests['lang_fr'] = { tests['lang_fr'] = {
'matrix': {'query': 'paris', 'lang': 'fr'}, 'matrix': {'query': 'paris', 'lang': 'fr'},
'result_container': ['not_empty', ('has_language', 'fr')], 'result_container': ['not_empty', ('has_language', 'fr')],

View file

@ -38,8 +38,8 @@ class OnlineCurrencyProcessor(OnlineProcessor):
engine_type = 'online_currency' engine_type = 'online_currency'
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match """Returns a set of :ref:`request params <engine request online_currency>`
to :py:obj:`parser_re`.""" or ``None`` if search query does not match to :py:obj:`parser_re`."""
params = super().get_params(search_query, engine_category) params = super().get_params(search_query, engine_category)
if params is None: if params is None:

View file

@ -18,8 +18,9 @@ class OnlineDictionaryProcessor(OnlineProcessor):
engine_type = 'online_dictionary' engine_type = 'online_dictionary'
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match """Returns a set of :ref:`request params <engine request online_dictionary>` or
to :py:obj:`parser_re`.""" ``None`` if search query does not match to :py:obj:`parser_re`.
"""
params = super().get_params(search_query, engine_category) params = super().get_params(search_query, engine_category)
if params is None: if params is None:
return None return None

View file

@ -20,9 +20,10 @@ class OnlineUrlSearchProcessor(OnlineProcessor):
engine_type = 'online_url_search' engine_type = 'online_url_search'
def get_params(self, search_query, engine_category): def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match """Returns a set of :ref:`request params <engine request online>` or ``None`` if
to at least one of :py:obj:`re_search_urls`. search query does not match to :py:obj:`re_search_urls`.
""" """
params = super().get_params(search_query, engine_category) params = super().get_params(search_query, engine_category)
if params is None: if params is None:
return None return None

View file

@ -731,22 +731,9 @@ engines:
- name: google - name: google
engine: google engine: google
shortcut: go shortcut: go
# see https://docs.searxng.org/src/searx.engines.google.html#module-searx.engines.google
use_mobile_ui: false
# additional_tests: # additional_tests:
# android: *test_android # android: *test_android
# - name: google italian
# engine: google
# shortcut: goit
# use_mobile_ui: false
# language: it
# - name: google mobile ui
# engine: google
# shortcut: gomui
# use_mobile_ui: true
- name: google images - name: google images
engine: google_images engine: google_images
shortcut: goi shortcut: goi
@ -1762,9 +1749,8 @@ engines:
engine: peertube engine: peertube
shortcut: ptb shortcut: ptb
paging: true paging: true
# https://instances.joinpeertube.org/instances # alternatives see: https://instances.joinpeertube.org/instances
base_url: https://peertube.biz/ # base_url: https://tube.4aem.com
# base_url: https://tube.tardis.world/
categories: videos categories: videos
disabled: true disabled: true
timeout: 6.0 timeout: 6.0

View file

@ -12,13 +12,13 @@ import logging
from base64 import b64decode from base64 import b64decode
from os.path import dirname, abspath from os.path import dirname, abspath
from searx.languages import language_codes as languages from .sxng_locales import sxng_locales
searx_dir = abspath(dirname(__file__)) searx_dir = abspath(dirname(__file__))
logger = logging.getLogger('searx') logger = logging.getLogger('searx')
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss'] OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages) SXNG_LOCALE_TAGS = ['all', 'auto'] + list(l[0] for l in sxng_locales)
SIMPLE_STYLE = ('auto', 'light', 'dark') SIMPLE_STYLE = ('auto', 'light', 'dark')
CATEGORIES_AS_TABS = { CATEGORIES_AS_TABS = {
'general': {}, 'general': {},
@ -156,8 +156,8 @@ SCHEMA = {
'safe_search': SettingsValue((0, 1, 2), 0), 'safe_search': SettingsValue((0, 1, 2), 0),
'autocomplete': SettingsValue(str, ''), 'autocomplete': SettingsValue(str, ''),
'autocomplete_min': SettingsValue(int, 4), 'autocomplete_min': SettingsValue(int, 4),
'default_lang': SettingsValue(tuple(LANGUAGE_CODES + ['']), ''), 'default_lang': SettingsValue(tuple(SXNG_LOCALE_TAGS + ['']), ''),
'languages': SettingSublistValue(LANGUAGE_CODES, LANGUAGE_CODES), 'languages': SettingSublistValue(SXNG_LOCALE_TAGS, SXNG_LOCALE_TAGS),
'ban_time_on_fail': SettingsValue(numbers.Real, 5), 'ban_time_on_fail': SettingsValue(numbers.Real, 5),
'max_ban_time_on_fail': SettingsValue(numbers.Real, 120), 'max_ban_time_on_fail': SettingsValue(numbers.Real, 120),
'suspended_times': { 'suspended_times': {

View file

@ -1,73 +1,120 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# list of language codes '''List of SearXNG's locale codes.
# this file is generated automatically by utils/fetch_languages.py
language_codes = ( This file is generated automatically by::
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), ./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'), '''
sxng_locales = (
('ar', 'العربية', '', 'Arabic', '\U0001f310'),
('bg', 'Български', '', 'Bulgarian', '\U0001f310'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca', 'Català', '', 'Catalan', '\U0001f310'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs', 'Čeština', '', 'Czech', '\U0001f310'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
('da', 'Dansk', '', 'Danish', '\U0001f310'),
('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'), ('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'),
('de', 'Deutsch', '', 'German', '\U0001f310'), ('de', 'Deutsch', '', 'German', '\U0001f310'),
('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'), ('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'),
('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'), ('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'),
('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'), ('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'),
('el', 'Ελληνικά', '', 'Greek', '\U0001f310'),
('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'), ('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'),
('en', 'English', '', 'English', '\U0001f310'), ('en', 'English', '', 'English', '\U0001f310'),
('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'), ('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'), ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'), ('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'),
('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'), ('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'),
('en-IN', 'English', 'India', 'English', '\U0001f1ee\U0001f1f3'),
('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'), ('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'),
('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'), ('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'),
('en-PH', 'English', 'Philippines', 'English', '\U0001f1f5\U0001f1ed'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'), ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
('en-ZA', 'English', 'South Africa', 'English', '\U0001f1ff\U0001f1e6'),
('es', 'Español', '', 'Spanish', '\U0001f310'), ('es', 'Español', '', 'Spanish', '\U0001f310'),
('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'), ('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'),
('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'), ('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'),
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'),
('et', 'Eesti', '', 'Estonian', '\U0001f310'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'), ('fi', 'Suomi', '', 'Finnish', '\U0001f310'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'), ('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'), ('he', 'עברית', '', 'Hebrew', '\U0001f1ee\U0001f1f7'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'), ('hi', 'हिन्दी', '', 'Hindi', '\U0001f310'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), ('hr', 'Hrvatski', '', 'Croatian', '\U0001f310'),
('hu', 'Magyar', '', 'Hungarian', '\U0001f310'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id', 'Indonesia', '', 'Indonesian', '\U0001f310'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'), ('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'), ('is', 'Íslenska', '', 'Icelandic', '\U0001f310'),
('it', 'Italiano', '', 'Italian', '\U0001f310'),
('it-CH', 'Italiano', 'Svizzera', 'Italian', '\U0001f1e8\U0001f1ed'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja', '日本語', '', 'Japanese', '\U0001f310'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko', '한국어', '', 'Korean', '\U0001f310'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
('lt-LT', 'Lietuvių', 'Lietuva', 'Lithuanian', '\U0001f1f1\U0001f1f9'), ('lt', 'Lietuvių', '', 'Lithuanian', '\U0001f310'),
('lv-LV', 'Latviešu', 'Latvija', 'Latvian', '\U0001f1f1\U0001f1fb'), ('lv', 'Latviešu', '', 'Latvian', '\U0001f310'),
('nb', 'Norsk Bokmål', '', 'Norwegian Bokmål', '\U0001f310'),
('nb-NO', 'Norsk Bokmål', 'Norge', 'Norwegian Bokmål', '\U0001f1f3\U0001f1f4'),
('nl', 'Nederlands', '', 'Dutch', '\U0001f310'), ('nl', 'Nederlands', '', 'Dutch', '\U0001f310'),
('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'), ('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'),
('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'), ('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'),
('no-NO', 'Norsk', '', 'Norwegian (Bokmål)', '\U0001f1f3\U0001f1f4'), ('pl', 'Polski', '', 'Polish', '\U0001f310'),
('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'), ('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'),
('pt', 'Português', '', 'Portuguese', '\U0001f310'), ('pt', 'Português', '', 'Portuguese', '\U0001f310'),
('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'), ('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'),
('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'), ('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'),
('ro', 'Română', '', 'Romanian', '\U0001f310'),
('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'), ('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'),
('ru', 'Русский', '', 'Russian', '\U0001f310'),
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'), ('sk', 'Slovenčina', '', 'Slovak', '\U0001f310'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'), ('sl', 'Slovenščina', '', 'Slovenian', '\U0001f310'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'), ('sr', 'Српски', '', 'Serbian', '\U0001f310'),
('sv', 'Svenska', '', 'Swedish', '\U0001f310'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'), ('th', 'ไทย', '', 'Thai', '\U0001f310'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr', 'Türkçe', '', 'Turkish', '\U0001f310'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'), ('uk', 'Українська', '', 'Ukrainian', '\U0001f310'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'), ('vi', 'Tiếng Việt', '', 'Vietnamese', '\U0001f310'),
('zh', '中文', '', 'Chinese', '\U0001f310'), ('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港', 'Chinese', '\U0001f1ed\U0001f1f0'), ('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),
('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'), ('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'),
) )
'''
A list of five-element tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (🌐)
.. code:: python
('en', 'English', '', 'English', '🌐'),
('en-CA', 'English', 'Canada', 'English', '🇨🇦'),
('en-US', 'English', 'United States', 'English', '🇺🇸'),
..
('fr', 'Français', '', 'French', '🌐'),
('fr-BE', 'Français', 'Belgique', 'French', '🇧🇪'),
('fr-CA', 'Français', 'Canada', 'French', '🇨🇦'),
:meta hide-value:
'''

View file

@ -1,12 +1,12 @@
<select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}} <select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}> <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
{{- _('Auto-detect') -}} {{- _('Auto-detect') -}}
{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%} {%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
</option> </option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} {%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}> <option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %} {% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]
</option> </option>
{%- endfor -%} {%- endfor -%}
</select> </select>

View file

@ -115,10 +115,10 @@
<legend id="pref_language">{{ _('Search language') }}</legend> <legend id="pref_language">{{ _('Search language') }}</legend>
<p class="value">{{- '' -}} <p class="value">{{- '' -}}
<select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}} <select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option> <option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option> <option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }} [auto]</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%} {%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option> <option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]</option>
{%- endfor -%} {%- endfor -%}
</select>{{- '' -}} </select>{{- '' -}}
</p> </p>

View file

@ -18,13 +18,11 @@ from urllib.parse import urljoin, urlparse
from lxml import html from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
from babel.core import get_global
from searx import settings from searx import settings
from searx.data import USER_AGENTS, data_dir from searx.data import USER_AGENTS, data_dir
from searx.version import VERSION_TAG from searx.version import VERSION_TAG
from searx.languages import language_codes from searx.sxng_locales import sxng_locales
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger from searx import logger
@ -53,8 +51,8 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None _FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
"""fasttext model to predict language of a search term""" """fasttext model to predict language of a search term"""
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes]) SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`).""" """Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
class _NotSetClass: # pylint: disable=too-few-public-methods class _NotSetClass: # pylint: disable=too-few-public-methods
@ -355,102 +353,16 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
is_abbr = len(lang) == 2 is_abbr = len(lang) == 2
lang = lang.lower() lang = lang.lower()
if is_abbr: if is_abbr:
for l in language_codes: for l in sxng_locales:
if l[0][:2] == lang: if l[0][:2] == lang:
return (True, l[0][:2], l[3].lower()) return (True, l[0][:2], l[3].lower())
return None return None
for l in language_codes: for l in sxng_locales:
if l[1].lower() == lang or l[3].lower() == lang: if l[1].lower() == lang or l[3].lower() == lang:
return (True, l[0][:2], l[3].lower()) return (True, l[0][:2], l[3].lower())
return None return None
def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
key = str(lang_list)
value = _LANG_TO_LC_CACHE.get(key, None)
if value is None:
value = {}
for lang in lang_list:
value.setdefault(lang.split('-')[0], lang)
_LANG_TO_LC_CACHE[key] = value
return value
# babel's get_global contains all sorts of miscellaneous locale and territory related data
# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
def _get_from_babel(lang_code: str, key):
match = get_global(key).get(lang_code.replace('-', '_'))
# for some keys, such as territory_aliases, match may be a list
if isinstance(match, str):
return match.replace('_', '-')
return match
def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102
"""auxiliary function to match lang_code in lang_list"""
# replace language code with a custom alias if necessary
if lang_code in custom_aliases:
lang_code = custom_aliases[lang_code]
if lang_code in lang_list:
return lang_code
# try to get the most likely country for this language
subtags = _get_from_babel(lang_code, 'likely_subtags')
if subtags:
if subtags in lang_list:
return subtags
subtag_parts = subtags.split('-')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
if new_code in custom_aliases:
new_code = custom_aliases[new_code]
if new_code in lang_list:
return new_code
# try to get the any supported country for this language
return _get_lang_to_lc_dict(lang_list).get(lang_code)
def match_language( # pylint: disable=W0102
locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US'
) -> Optional[str]:
"""get the language code from lang_list that best matches locale_code"""
# try to get language from given locale_code
language = _match_language(locale_code, lang_list, custom_aliases)
if language:
return language
locale_parts = locale_code.split('-')
lang_code = locale_parts[0]
# if locale_code has script, try matching without it
if len(locale_parts) > 2:
language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
if language:
return language
# try to get language using an equivalent country code
if len(locale_parts) > 1:
country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
if country_alias:
language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
if language:
return language
# try to get language using an equivalent language code
alias = _get_from_babel(lang_code, 'language_aliases')
if alias:
language = _match_language(alias, lang_list, custom_aliases)
if language:
return language
if lang_code != locale_code:
# try to get language from given language without giving the country
language = _match_language(lang_code, lang_list, custom_aliases)
return language or fallback
def load_module(filename: str, module_dir: str) -> types.ModuleType: def load_module(filename: str, module_dir: str) -> types.ModuleType:
modname = splitext(filename)[0] modname = splitext(filename)[0]
modpath = join(module_dir, filename) modpath = join(module_dir, filename)

View file

@ -89,7 +89,6 @@ from searx.utils import (
html_to_text, html_to_text,
gen_useragent, gen_useragent,
dict_subset, dict_subset,
match_language,
) )
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery from searx.query import RawTextQuery
@ -117,12 +116,13 @@ from searx.locales import (
RTL_LOCALES, RTL_LOCALES,
localeselector, localeselector,
locales_initialize, locales_initialize,
match_locale,
) )
# renaming names from searx imports ... # renaming names from searx imports ...
from searx.autocomplete import search_autocomplete, backends as autocomplete_backends from searx.autocomplete import search_autocomplete, backends as autocomplete_backends
from searx.languages import language_codes as languages
from searx.redisdb import initialize as redis_initialize from searx.redisdb import initialize as redis_initialize
from searx.sxng_locales import sxng_locales
from searx.search import SearchWithPlugins, initialize as search_initialize from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.network import stream as http_stream, set_context_network_name from searx.network import stream as http_stream, set_context_network_name
from searx.search.checker import get_result as checker_get_result from searx.search.checker import get_result as checker_get_result
@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list):
if '-' in lang: if '-' in lang:
lang_parts = lang.split('-') lang_parts = lang.split('-')
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper()) lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
locale = match_language(lang, lang_list, fallback=None) locale = match_locale(lang, lang_list, fallback=None)
if locale is not None: if locale is not None:
return locale return locale
return 'en' return 'en'
@ -407,7 +407,7 @@ def get_client_settings():
def render(template_name: str, **kwargs): def render(template_name: str, **kwargs):
# pylint: disable=too-many-statements
kwargs['client_settings'] = str( kwargs['client_settings'] = str(
base64.b64encode( base64.b64encode(
bytes( bytes(
@ -438,17 +438,20 @@ def render(template_name: str, **kwargs):
kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY
# i18n # i18n
kwargs['language_codes'] = [l for l in languages if l[0] in settings['search']['languages']] kwargs['sxng_locales'] = [l for l in sxng_locales if l[0] in settings['search']['languages']]
locale = request.preferences.get_value('locale') locale = request.preferences.get_value('locale')
kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale) kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale)
if locale in RTL_LOCALES and 'rtl' not in kwargs: if locale in RTL_LOCALES and 'rtl' not in kwargs:
kwargs['rtl'] = True kwargs['rtl'] = True
if 'current_language' not in kwargs: if 'current_language' not in kwargs:
kwargs['current_language'] = match_language( _locale = request.preferences.get_value('language')
request.preferences.get_value('language'), settings['search']['languages'] if _locale in ('auto', 'all'):
) kwargs['current_language'] = _locale
else:
kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
# values from settings # values from settings
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
@ -810,6 +813,13 @@ def search():
) )
) )
if search_query.lang in ('auto', 'all'):
current_language = search_query.lang
else:
current_language = match_locale(
search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
)
# search_query.lang contains the user choice (all, auto, en, ...) # search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language # when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equal to search_query.lang # otherwise it is equal to search_query.lang
@ -832,12 +842,8 @@ def search():
result_container.unresponsive_engines result_container.unresponsive_engines
), ),
current_locale = request.preferences.get_value("locale"), current_locale = request.preferences.get_value("locale"),
current_language = match_language( current_language = current_language,
search_query.lang, search_language = match_locale(
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
search_language = match_language(
search.search_query.lang, search.search_query.lang,
settings['search']['languages'], settings['search']['languages'],
fallback=request.preferences.get_value("language") fallback=request.preferences.get_value("language")
@ -907,16 +913,11 @@ def autocompleter():
# and there is a query part # and there is a query part
if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0: if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0:
# get language from cookie # get SearXNG's locale and autocomplete backend from cookie
language = request.preferences.get_value('language') sxng_locale = request.preferences.get_value('language')
if not language or language == 'all': backend_name = request.preferences.get_value('autocomplete')
language = 'en'
else:
language = language.split('-')[0]
# run autocompletion for result in search_autocomplete(backend_name, sug_prefix, sxng_locale):
raw_results = search_autocomplete(request.preferences.get_value('autocomplete'), sug_prefix, language)
for result in raw_results:
# attention: this loop will change raw_text_query object and this is # attention: this loop will change raw_text_query object and this is
# the reason why the sug_prefix was stored before (see above) # the reason why the sug_prefix was stored before (see above)
if result != sug_prefix: if result != sug_prefix:
@ -1001,7 +1002,9 @@ def preferences():
'rate80': rate80, 'rate80': rate80,
'rate95': rate95, 'rate95': rate95,
'warn_timeout': e.timeout > settings['outgoing']['request_timeout'], 'warn_timeout': e.timeout > settings['outgoing']['request_timeout'],
'supports_selected_language': _is_selected_language_supported(e, request.preferences), 'supports_selected_language': e.traits.is_locale_supported(
str(request.preferences.get_value('language') or 'all')
),
'result_count': result_count, 'result_count': result_count,
} }
# end of stats # end of stats
@ -1052,7 +1055,9 @@ def preferences():
# supports # supports
supports = {} supports = {}
for _, e in filtered_engines.items(): for _, e in filtered_engines.items():
supports_selected_language = _is_selected_language_supported(e, request.preferences) supports_selected_language = e.traits.is_locale_supported(
str(request.preferences.get_value('language') or 'all')
)
safesearch = e.safesearch safesearch = e.safesearch
time_range_support = e.time_range_support time_range_support = e.time_range_support
for checker_test_name in checker_results.get(e.name, {}).get('errors', {}): for checker_test_name in checker_results.get(e.name, {}).get('errors', {}):
@ -1099,16 +1104,6 @@ def preferences():
) )
def _is_selected_language_supported(engine, preferences: Preferences): # pylint: disable=redefined-outer-name
language = preferences.get_value('language')
if language == 'all':
return True
x = match_language(
language, getattr(engine, 'supported_languages', []), getattr(engine, 'language_aliases', {}), None
)
return bool(x)
@app.route('/image_proxy', methods=['GET']) @app.route('/image_proxy', methods=['GET'])
def image_proxy(): def image_proxy():
# pylint: disable=too-many-return-statements, too-many-branches # pylint: disable=too-many-return-statements, too-many-branches
@ -1327,10 +1322,7 @@ def config():
if not request.preferences.validate_token(engine): if not request.preferences.validate_token(engine):
continue continue
supported_languages = engine.supported_languages _languages = engine.traits.languages.keys()
if isinstance(engine.supported_languages, dict):
supported_languages = list(engine.supported_languages.keys())
_engines.append( _engines.append(
{ {
'name': name, 'name': name,
@ -1339,7 +1331,8 @@ def config():
'enabled': not engine.disabled, 'enabled': not engine.disabled,
'paging': engine.paging, 'paging': engine.paging,
'language_support': engine.language_support, 'language_support': engine.language_support,
'supported_languages': supported_languages, 'languages': list(_languages),
'regions': list(engine.traits.regions.keys()),
'safesearch': engine.safesearch, 'safesearch': engine.safesearch,
'time_range_support': engine.time_range_support, 'time_range_support': engine.time_range_support,
'timeout': engine.timeout, 'timeout': engine.timeout,

View file

@ -1,4 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import annotations
import os import os
import pathlib import pathlib
import csv import csv
@ -8,7 +10,7 @@ import re
import inspect import inspect
import itertools import itertools
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Iterable, List, Tuple, Dict from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
from io import StringIO from io import StringIO
from codecs import getincrementalencoder from codecs import getincrementalencoder
@ -16,7 +18,10 @@ from codecs import getincrementalencoder
from flask_babel import gettext, format_date from flask_babel import gettext, format_date
from searx import logger, settings from searx import logger, settings
from searx.engines import Engine, OTHER_CATEGORY from searx.engines import OTHER_CATEGORY
if TYPE_CHECKING:
from searx.enginelib import Engine
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$') VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')

View file

@ -18,8 +18,8 @@ from os.path import join
from lxml.html import fromstring from lxml.html import fromstring
from searx.engines import wikidata, set_loggers from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, match_language from searx.utils import extract_text
from searx.locales import LOCALE_NAMES, locales_initialize from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir from searx import searx_dir
from searx.utils import gen_useragent, detect_language from searx.utils import gen_useragent, detect_language
import searx.search import searx.search
@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website):
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None: if fetched_lang is None or desc is None:
continue continue
matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None) matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
if matched_lang is None: if matched_lang is None:
fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
if matched_lang is not None: if matched_lang is not None:
update_description(engine_name, matched_lang, desc, website, replace=False) update_description(engine_name, matched_lang, desc, website, replace=False)

View file

@ -0,0 +1,198 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of engines traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from the language and region tags that are supported by a
  minimum number of engines.

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""
# pylint: disable=invalid-name
from unicodedata import lookup
from pathlib import Path
from pprint import pformat
import babel
from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap
# Output files.
languages_file = Path(searx_dir) / 'sxng_locales.py'
languages_file_header = """\
# -*- coding: utf-8 -*-
'''List of SearXNG's locale codes.
This file is generated automatically by::
./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
'''
sxng_locales = (
"""
languages_file_footer = """,
)
'''
A list of five-digit tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (\U0001F310)
.. code:: python
('en', 'English', '', 'English', '\U0001f310'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
..
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
:meta hide-value:
'''
"""
# Hand-picked flag overrides.  get_unicode_flag() consults this table before it
# derives a flag from the territory code: the keys are compared against the
# locale's language code and against its lower-cased territory code.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F7',  # Hebrew
}
def main():
    """Load the configured engines, fetch their traits and rewrite the locale list."""
    load_engines(settings['engines'])
    # traits_map = EngineTraitsMap.from_data()
    write_languages_file(filter_locales(fetch_traits_map()))
def fetch_traits_map():
    """Fetch the traits of each engine, save them to the JSON file and return them."""

    def _print_progress(msg):
        print(msg)

    network.set_timeout_for_thread(10.0)
    fetched = EngineTraitsMap.fetch_traits(log=_print_progress)

    print("fetched properties from %s engines" % len(fetched))
    print("write json file: %s" % fetched.ENGINE_TRAITS_FILE)
    fetched.save_data()

    return fetched
def filter_locales(traits_map: "EngineTraitsMap", *, min_eng_per_region: int = 11, min_eng_per_lang: int = 13):
    """Filter language & region tags by a threshold.

    :param traits_map: mapping whose values expose the dicts ``regions`` and
        ``languages``, keyed by SearXNG region / language tag.
    :param min_eng_per_region: a region tag is kept when at least this many
        engines list it.
    :param min_eng_per_lang: a language tag is kept when at least this many
        engines list it.
    :return: set with the kept region tags, the language part of each kept
        region tag and the kept language tags.
    """
    from collections import Counter  # pylint: disable=import-outside-toplevel

    # number of engines per region tag
    region_count = Counter(reg for eng in traits_map.values() for reg in eng.regions)
    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}
    lang_from_region = {tag.split('-')[0] for tag in regions}

    # number of engines per language tag; ignore script types like zh_Hant,
    # zh_Hans or sr_Latin, pa_Arab (they are already counted by the existence
    # of 'zh' or 'sr', 'pa')
    lang_count = Counter(lang for eng in traits_map.values() for lang in eng.languages if '_' not in lang)
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages
def write_languages_file(sxng_tag_list):
    """Generate the python module ``languages_file`` from the locale tags.

    :param sxng_tag_list: iterable of SearXNG locale tags (language or region
        tags using ``-`` as separator); each tag is parsed by
        :py:obj:`babel.Locale.parse`.

    For every tag, in sorted order, one tuple is written: the tag, the name of
    the language, the name of the territory (or an empty string), the english
    name and the unicode flag.
    """
    language_codes = []

    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')

        flag = get_unicode_flag(sxng_locale) or ''

        item = (
            sxng_tag,
            sxng_locale.get_language_name().title(),
            sxng_locale.get_territory_name() or '',
            sxng_locale.english_name.split(' (')[0],
            UnicodeEscape(flag),
        )

        language_codes.append(item)

    language_codes = tuple(language_codes)

    # Build the content first, so a formatting error can't leave a truncated file.
    file_content = "{header} {language_codes}{footer}".format(
        header=languages_file_header,
        # strip the parentheses of the tuple, they are part of header / footer
        language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
        footer=languages_file_footer,
    )

    # the ``with`` statement closes the file, no explicit close() is needed
    with open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`

    The ``repr`` is a single-quoted python string literal in which all
    non-ASCII characters are written as escape sequences.
    """

    def __repr__(self):
        # the unicode-escape codec emits pure ASCII and already doubles backslashes
        escaped = self.encode('unicode-escape').decode('ascii')
        # a single quote inside the string would otherwise terminate the literal
        return "'" + escaped.replace("'", "\\'") + "'"
def get_unicode_flag(locale: babel.Locale):
    """Determine a unicode flag (emoji) that fits to the ``locale``

    Returns ``None`` if the territory can't be mapped to regional indicator
    symbols.
    """
    # hand-picked flag for the language
    override = lang2emoji.get(locale.language)
    if override:
        return override

    territory = locale.territory
    if not territory:
        # a language without territory is represented by a globe
        return '\U0001F310'

    # hand-picked flag for the territory
    override = lang2emoji.get(territory.lower())
    if override:
        return override

    prefix = 'REGIONAL INDICATOR SYMBOL LETTER '
    try:
        first = lookup(prefix + territory[0])
        second = lookup(prefix + territory[1])
        # print("OK   : %s --> %s%s" % (locale, c1, c2))
    except KeyError as exc:
        print("ERROR: %s --> %s" % (locale, exc))
        return None

    return first + second
if __name__ == "__main__":
main()

View file

@ -1,313 +0,0 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script generates languages.py from intersecting each engine's supported
languages.
Output files: :origin:`searx/data/engines_languages.json` and
:origin:`searx/languages.py` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=invalid-name
from unicodedata import lookup
import json
from pathlib import Path
from pprint import pformat
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
from babel.core import parse_locale
from searx import settings, searx_dir
from searx.engines import load_engines, engines
from searx.network import set_timeout_for_thread
# Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py'
# Fetches supported languages for each engine and writes json file with those.
def fetch_supported_languages():
    """Ask every engine that offers ``fetch_supported_languages`` for its languages.

    The collected mapping (engine name --> languages) is dumped to
    ``engines_languages_file`` and returned.
    """
    set_timeout_for_thread(10.0)

    engines_languages = {}

    for engine_name in sorted(engines):
        engine = engines[engine_name]
        if not hasattr(engine, 'fetch_supported_languages'):
            continue

        supported = engine.fetch_supported_languages()
        print("fetched %s languages from engine %s" % (len(supported), engine_name))

        # only plain lists are sorted, other containers are stored as fetched
        if type(supported) == list:  # pylint: disable=unidiomatic-typecheck
            supported = sorted(supported)
        engines_languages[engine_name] = supported

    print("fetched languages from %s engines" % len(engines_languages))

    # write json file
    with open(engines_languages_file, 'w', encoding='utf-8') as f:
        json.dump(engines_languages, f, indent=2, sort_keys=True)

    return engines_languages
# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
    """Parse ``lang_code`` (``-`` separated) into a babel ``Locale``.

    Returns ``None`` if babel raises ``UnknownLocaleError`` or ``ValueError``.
    """
    try:
        return Locale.parse(lang_code, sep='-')
    except (UnknownLocaleError, ValueError):
        return None
# Hand-picked flag overrides.  get_unicode_flag() looks up the lower-cased
# language code, the parsed language and the lower-cased territory in this
# table before it derives a flag from the territory code.
lang2emoji = {
    'ha': '\U0001F1F3\U0001F1EA',  # Hausa / Niger
    'bs': '\U0001F1E7\U0001F1E6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001F1EF\U0001F1F5',  # Japanese
    'ua': '\U0001F1FA\U0001F1E6',  # Ukrainian
    'he': '\U0001F1EE\U0001F1F7',  # Hebrew
}
def get_unicode_flag(lang_code):
    """Determine a unicode flag (emoji) that fits to the ``lang_code``

    Returns the entry from ``lang2emoji`` if there is one, a globe for
    two-letter codes, otherwise the two regional indicator symbols of the
    territory.  ``None`` is returned if no flag can be determined.
    """
    # NOTE(review): nesting reconstructed from a flattened listing -- confirm.

    # a hand-picked flag for the complete code wins
    emoji = lang2emoji.get(lang_code.lower())
    if emoji:
        return emoji

    # two-letter codes are shown with a globe
    if len(lang_code) == 2:
        return '\U0001F310'

    # if parsing fails, the error is printed and all parts stay empty strings
    language = territory = script = variant = ''
    try:
        language, territory, script, variant = parse_locale(lang_code, '-')
    except ValueError as exc:
        print(exc)

    # https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
    if not territory:
        # https://www.unicode.org/emoji/charts/emoji-list.html#country-flag
        emoji = lang2emoji.get(language)
        if not emoji:
            print(
                "%s --> language: %s / territory: %s / script: %s / variant: %s"
                % (lang_code, language, territory, script, variant)
            )
        # may be None: no territory and no hand-picked flag for the language
        return emoji

    # a hand-picked flag for the territory
    emoji = lang2emoji.get(territory.lower())
    if emoji:
        return emoji

    # build the flag from the first two characters of the territory
    try:
        c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0])
        c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1])
        # print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 ))
    except KeyError as exc:
        # unicodedata has no regional indicator symbol for this character
        print("%s --> territory: %s --> %s" % (lang_code, territory, exc))
        return None

    return c1 + c2
def get_territory_name(lang_code):
    """Return the territory name of ``lang_code`` as reported by babel.

    ``None`` is returned if ``lang_code`` has no babel locale or if babel
    raises ``FileNotFoundError`` (which is printed).
    """
    locale = get_locale(lang_code)
    if locale is None:
        return None

    try:
        return locale.get_territory_name()
    except FileNotFoundError as exc:
        print("ERROR: %s --> %s" % (locale, exc))
        return None
# Join all language lists.
def join_language_lists(engines_languages):
    """Merge the language lists of all engines into one dict.

    :param engines_languages: mapping engine name --> language codes of this
        engine; the entry ``'wikipedia'`` is read as a fallback for language
        names that babel doesn't know.
    :return: dict keyed by the short language code; each value holds
        ``name``, ``english_name``, ``counter`` (set of engine names) and
        ``countries`` (dict keyed by ``<lang>-<country>`` with
        ``country_name`` and its own ``counter``).
    """
    # NOTE(review): nesting reconstructed from a flattened listing -- confirm.
    language_list = {}
    for engine_name in engines_languages:
        for lang_code in engines_languages[engine_name]:

            # apply custom fixes if necessary: translate the engine's alias
            # back to the code it stands for
            if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
                lang_code = next(
                    lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias
                )

            locale = get_locale(lang_code)

            # ensure that lang_code uses standard language and country codes
            if locale and locale.territory:
                lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
            short_code = lang_code.split('-')[0]

            # add language without country if not in list
            if short_code not in language_list:
                if locale:
                    # get language's data from babel's Locale object
                    language_name = locale.get_language_name().title()
                    english_name = locale.english_name.split(' (')[0]
                elif short_code in engines_languages['wikipedia']:
                    # get language's data from wikipedia if not known by babel
                    language_name = engines_languages['wikipedia'][short_code]['name']
                    english_name = engines_languages['wikipedia'][short_code]['english_name']
                else:
                    language_name = None
                    english_name = None

                # add language to list
                language_list[short_code] = {
                    'name': language_name,
                    'english_name': english_name,
                    'counter': set(),
                    'countries': {},
                }

            # add language with country if not in list
            if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
                country_name = ''
                if locale:
                    # get country name from babel's Locale object
                    try:
                        country_name = locale.get_territory_name()
                    except FileNotFoundError as exc:
                        print("ERROR: %s --> %s" % (locale, exc))
                        locale = None

                language_list[short_code]['countries'][lang_code] = {
                    'country_name': country_name,
                    'counter': set(),
                }

            # count engine for both language_country combination and language alone
            language_list[short_code]['counter'].add(engine_name)
            if lang_code != short_code:
                language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)

    return language_list
# Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages):
    """Reduce ``all_languages`` to the well supported languages and countries.

    :param all_languages: dict keyed by language code; the values are read
        for ``name``, ``english_name``, ``counter`` and ``countries``.
    :return: flat dict keyed by ``<lang>`` or ``<lang>-<country>`` with
        ``name``, ``english_name`` and (for countries) ``country_name``.
    """
    # NOTE(review): nesting reconstructed from a flattened listing; in
    # particular the final ``if lang_country: ... else: ...`` is assumed to be
    # inside ``if not filtered_countries:`` -- confirm.
    min_engines_per_lang = 12
    min_engines_per_country = 7
    # pylint: disable=consider-using-dict-items, consider-iterating-dictionary
    # enabled engines of category 'general' that have supported_languages
    main_engines = [
        engine_name
        for engine_name in engines.keys()
        if 'general' in engines[engine_name].categories
        and engines[engine_name].supported_languages
        and not engines[engine_name].disabled
    ]

    # filter list to include only languages supported by most engines or all default general engines
    # NOTE(review): all() is True for an empty main_engines list, in that case
    # every language passes this filter.
    filtered_languages = {
        code: lang
        for code, lang in all_languages.items()
        if (
            len(lang['counter']) >= min_engines_per_lang
            or all(main_engine in lang['counter'] for main_engine in main_engines)
        )
    }

    def _copy_lang_data(lang, country_name=None):
        # new entry with the names of the language; country_name is only
        # stored if it is truthy
        new_dict = {}
        new_dict['name'] = all_languages[lang]['name']
        new_dict['english_name'] = all_languages[lang]['english_name']
        if country_name:
            new_dict['country_name'] = country_name
        return new_dict

    # for each language get country codes supported by most engines or at least one country code
    filtered_languages_with_countries = {}
    for lang, lang_data in filtered_languages.items():
        countries = lang_data['countries']
        filtered_countries = {}

        # get language's country codes with enough supported engines
        for lang_country, country_data in countries.items():
            if len(country_data['counter']) >= min_engines_per_country:
                filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])

        # add language without countries too if there's more than one country to choose from
        if len(filtered_countries) > 1:
            filtered_countries[lang] = _copy_lang_data(lang, None)
        elif len(filtered_countries) == 1:
            # NOTE(review): with the nesting assumed here this value is not read afterwards
            lang_country = next(iter(filtered_countries))

        # if no country has enough engines try to get most likely country code from babel
        if not filtered_countries:
            lang_country = None
            subtags = get_global('likely_subtags').get(lang)
            if subtags:
                # last part of the likely subtag, used only if it has two characters
                country_code = subtags.split('_')[-1]
                if len(country_code) == 2:
                    lang_country = "{lang}-{country}".format(lang=lang, country=country_code)

            if lang_country:
                filtered_countries[lang_country] = _copy_lang_data(lang, None)
            else:
                filtered_countries[lang] = _copy_lang_data(lang, None)

        filtered_languages_with_countries.update(filtered_countries)

    return filtered_languages_with_countries
class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`

    The ``repr`` is a single-quoted python string literal in which all
    non-ASCII characters are written as escape sequences.
    """

    def __repr__(self):
        # the unicode-escape codec emits pure ASCII and already doubles backslashes
        escaped = self.encode('unicode-escape').decode('ascii')
        # a single quote inside the string would otherwise terminate the literal
        return "'" + escaped.replace("'", "\\'") + "'"
# Write languages.py.
def write_languages_file(languages):
    """Generate the python module ``languages_file`` from ``languages``.

    :param languages: dict keyed by language code; the values are read for
        ``name`` and ``english_name``.  Entries whose ``name`` is ``None`` are
        reported on stdout and skipped.
    """
    file_headers = (
        "# -*- coding: utf-8 -*-",
        "# list of language codes",
        "# this file is generated automatically by utils/fetch_languages.py",
        "language_codes = (\n",
    )

    language_codes = []

    for code in sorted(languages):

        name = languages[code]['name']
        if name is None:
            print("ERROR: languages['%s'] --> %s" % (code, languages[code]))
            continue

        flag = get_unicode_flag(code) or ''
        item = (
            code,
            name.split(' (')[0],
            get_territory_name(code) or '',
            languages[code].get('english_name') or '',
            UnicodeEscape(flag),
        )

        language_codes.append(item)

    language_codes = tuple(language_codes)

    # Build the content first, so a formatting error can't leave a truncated file.
    file_content = "{file_headers} {language_codes},\n)\n".format(
        file_headers='\n'.join(file_headers),
        # strip the parentheses of the tuple, they are part of header / footer
        language_codes=pformat(language_codes, indent=4)[1:-1],
    )

    # the ``with`` statement closes the file, no explicit close() is needed
    with open(languages_file, 'w', encoding='utf-8') as new_file:
        new_file.write(file_content)
if __name__ == "__main__":
    # Pipeline of the script: load the configured engines, fetch the language
    # lists of the engines, merge them, keep the well supported languages and
    # write the result to the languages file.
    load_engines(settings['engines'])
    _engines_languages = fetch_supported_languages()
    _all_languages = join_language_lists(_engines_languages)
    _filtered_languages = filter_language_list(_all_languages)
    write_languages_file(_filtered_languages)

View file

@ -50,7 +50,7 @@ from pathlib import Path
from searx import searx_dir from searx import searx_dir
from searx.network import set_timeout_for_thread from searx.network import set_timeout_for_thread
from searx.engines import wikidata, set_loggers from searx.engines import wikidata, set_loggers
from searx.languages import language_codes from searx.sxng_locales import sxng_locales
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
set_loggers(wikidata, 'wikidata') set_loggers(wikidata, 'wikidata')
@ -76,7 +76,7 @@ GROUP BY ?key ?item ?itemLabel
ORDER BY ?key ?item ?itemLabel ORDER BY ?key ?item ?itemLabel
""" """
LANGUAGES = [l[0].lower() for l in language_codes] LANGUAGES = [l[0].lower() for l in sxng_locales]
PRESET_KEYS = { PRESET_KEYS = {
('wikidata',): {'en': 'Wikidata'}, ('wikidata',): {'en': 'Wikidata'},

111
tests/unit/test_locales.py Normal file
View file

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Test some code from module :py:obj:`searx.locales`"""
from searx import locales
from searx.sxng_locales import sxng_locales
from tests import SearxTestCase
class TestLocales(SearxTestCase):
    """Implemented tests:

    - :py:obj:`searx.locales.match_locale`
    """

    def test_match_locale(self):
        """Check the tag that ``match_locale`` picks from a list of locale tags."""

        locale_tag_list = [x[0] for x in sxng_locales]

        # Test SearXNG search languages

        self.assertEqual(locales.match_locale('de', locale_tag_list), 'de')
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr')
        self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh')

        # Test SearXNG search regions

        self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES')
        self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT')
        self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE')
        self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB')
        self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
        self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA')
        self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH')
        self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW')
        self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK')

        # Test language script code

        self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN')
        self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW')
        self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW')

        # Test individual locale lists

        self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback')

        self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
        self.assertEqual(locales.match_locale('es', ['ES']), 'ES')
        self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
        self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR')
        self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES')
        self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR')

        # Tests from the commit message of 9ae409a05a

        # Assumption:
        #   A. When a user selects a language the results should be optimized according to
        #      the selected language.
        #
        #   B. When user selects a language and a territory the results should be
        #      optimized with first priority on territory and second on language.

        # Assume we have an engine that supports the following locales:
        locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA']

        # Examples (Assumption A.)
        # ------------------------

        # A user selects region 'zh-TW' which should end in zh_HK.
        # hint: CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')
        self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK')

        # A user selects only the language 'zh' which should end in CN
        self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN')

        # A user selects only the language 'fr' which should end in fr_CA
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA')

        # The difference in priority on the territory is best shown with an
        # engine that supports the following locales:
        locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE']

        # A user selects only a language
        self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB')

        # hint: the engine supports fr_FR and fr_CA; since no territory is
        # given, fr_FR takes priority ..
        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR')

        # Examples (Assumption B.)
        # ------------------------

        # A user selects region 'fr-BE' which should end in nl-BE
        self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE')

        # If the user selects a language and there are two locales like the
        # following:

        locale_tag_list = ['fr-BE', 'fr-CH']

        # The locale is selected by looking at the "population percent" and
        # this percentage is higher in BE (68%) compared to CH (21%)

        self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE')

View file

@ -87,39 +87,6 @@ class TestUtils(SearxTestCase):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>' html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
self.assertEqual(utils.html_to_text(html), "Lorem ipsum") self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
def test_match_language(self):
self.assertEqual(utils.match_language('es', ['es']), 'es')
self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
# handle script tags
self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN')
self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW')
self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN')
self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW')
self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN')
self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW')
aliases = {'en-GB': 'en-UK', 'he': 'iw'}
# guess country
self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
# language aliases
self.assertEqual(utils.match_language('iw', ['he']), 'he')
self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
def test_ecma_unscape(self): def test_ecma_unscape(self):
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')

View file

@ -52,9 +52,6 @@ enabled_plugins:
engines: engines:
- name: google
use_mobile_ui: true
# - name: fdroid # - name: fdroid
# disabled: false # disabled: false
# #