Merge pull request #683 from return42/fix-doc

Document & Pylint scripts in searxng_extra/update
Martin Fischer 2022-01-05 19:46:00 +01:00 committed by GitHub
commit 160f3e022e
10 changed files with 206 additions and 62 deletions

docs/dev/searxng_extra/index.rst

@@ -1,14 +1,15 @@
 .. _searxng_extra:
 
-======================================================
-Tooling box ``searxng_extra`` for developers and users
-======================================================
+=============================
+Tooling box ``searxng_extra``
+=============================
 
-In the folder :origin:`searxng_extra/` we maintain some tools useful for
+In the folder :origin:`searxng_extra/` we maintain some tools useful for CI and
 developers.
 
 .. toctree::
    :maxdepth: 2
    :caption: Contents
 
+   update
    standalone_searx.py

docs/dev/searxng_extra/update.rst

@@ -0,0 +1,88 @@
+=========================
+``searxng_extra/update/``
+=========================
+
+:origin:`[source] <searxng_extra/update/__init__.py>`
+
+Scripts to update static data in :origin:`searx/data/`
+
+.. _update_ahmia_blacklist.py:
+
+``update_ahmia_blacklist.py``
+=============================
+
+:origin:`[source] <searxng_extra/update/update_ahmia_blacklist.py>`
+
+.. automodule:: searxng_extra.update.update_ahmia_blacklist
+  :members:
+
+``update_currencies.py``
+========================
+
+:origin:`[source] <searxng_extra/update/update_currencies.py>`
+
+.. automodule:: searxng_extra.update.update_currencies
+  :members:
+
+``update_engine_descriptions.py``
+=================================
+
+:origin:`[source] <searxng_extra/update/update_engine_descriptions.py>`
+
+.. automodule:: searxng_extra.update.update_engine_descriptions
+  :members:
+
+``update_external_bangs.py``
+============================
+
+:origin:`[source] <searxng_extra/update/update_external_bangs.py>`
+
+.. automodule:: searxng_extra.update.update_external_bangs
+  :members:
+
+``update_firefox_version.py``
+=============================
+
+:origin:`[source] <searxng_extra/update/update_firefox_version.py>`
+
+.. automodule:: searxng_extra.update.update_firefox_version
+  :members:
+
+``update_languages.py``
+=======================
+
+:origin:`[source] <searxng_extra/update/update_languages.py>`
+
+.. automodule:: searxng_extra.update.update_languages
+  :members:
+
+``update_osm_keys_tags.py``
+===========================
+
+:origin:`[source] <searxng_extra/update/update_osm_keys_tags.py>`
+
+.. automodule:: searxng_extra.update.update_osm_keys_tags
+  :members:
+
+``update_pygments.py``
+======================
+
+:origin:`[source] <searxng_extra/update/update_pygments.py>`
+
+.. automodule:: searxng_extra.update.update_pygments
+  :members:
+
+``update_wikidata_units.py``
+============================
+
+:origin:`[source] <searxng_extra/update/update_wikidata_units.py>`
+
+.. automodule:: searxng_extra.update.update_wikidata_units
+  :members:
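The ``automodule`` directives above import each script and render its module docstring, which is why the changes below add real docstrings and move top-level statements under a ``__main__`` guard: the import Sphinx performs has to stay free of side effects. A minimal sketch (hypothetical module, not part of the PR) of a script shaped for this setup:

"""Update :origin:`searx/data/example.json` (hypothetical example).

This module-level docstring is what ``.. automodule::`` renders.
"""

def fetch_example():
    """Member docstrings are rendered via the ``:members:`` option."""
    return []

if __name__ == '__main__':
    # runs when executed directly, but not when Sphinx imports the module
    print(fetch_example())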

searxng_extra/update/update_ahmia_blacklist.py

@@ -1,10 +1,15 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""This script saves `Ahmia's blacklist`_ for onion sites.
 
-# This script saves Ahmia's blacklist for onion sites.
-# More info in https://ahmia.fi/blacklist/
+Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data
+... <.github/workflows/data-update.yml>`).
+
+.. _Ahmia's blacklist: https://ahmia.fi/blacklist/
+
+"""
 
-# set path
 from os.path import join
 
 import requests
@@ -17,15 +22,14 @@ def fetch_ahmia_blacklist():
     resp = requests.get(URL, timeout=3.0)
     if resp.status_code != 200:
         raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code)
-    else:
-        blacklist = resp.text.split()
-        return blacklist
+    return resp.text.split()
 
 
 def get_ahmia_blacklist_filename():
     return join(join(searx_dir, "data"), "ahmia_blacklist.txt")
 
 
-blacklist = fetch_ahmia_blacklist()
-with open(get_ahmia_blacklist_filename(), "w") as f:
-    f.write('\n'.join(blacklist))
+if __name__ == '__main__':
+    blacklist = fetch_ahmia_blacklist()
+    with open(get_ahmia_blacklist_filename(), "w", encoding='utf-8') as f:
+        f.write('\n'.join(blacklist))
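One quirk survives this cleanup: ``"Error fetching Ahmia blacklist, HTTP code " + resp.status_code`` concatenates a str with an int, so the error branch itself dies with a TypeError rather than the intended Exception. A sketch of the usual fix (the URL constant is assumed; this hunk does not show it):

import requests

URL = "https://ahmia.fi/blacklist/banned/"  # assumed value, not shown in this hunk

def fetch_ahmia_blacklist():
    resp = requests.get(URL, timeout=3.0)
    if resp.status_code != 200:
        # interpolate instead of str + int, which itself raises TypeError
        raise Exception(f"Error fetching Ahmia blacklist, HTTP code {resp.status_code}")
    return resp.text.split()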

searxng_extra/update/update_currencies.py

@@ -1,13 +1,22 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+
+"""
+# pylint: disable=invalid-name
 
 import re
 import unicodedata
 import json
 
 # set path
-from sys import path
-from os.path import realpath, dirname, join
+from os.path import join
 
 from searx import searx_dir
 from searx.locales import LOCALE_NAMES

searxng_extra/update/update_engine_descriptions.py

@@ -1,6 +1,16 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""Fetch website description from websites and from
+:origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/engine_descriptions.json`.
+
+"""
+# pylint: disable=invalid-name, global-statement
 
 import json
 from urllib.parse import urlparse
 from os.path import join
@@ -102,7 +112,7 @@ def get_wikipedia_summary(lang, pageid):
         response.raise_for_status()
         api_result = json.loads(response.text)
         return api_result.get('extract')
-    except:
+    except Exception:  # pylint: disable=broad-except
         return None
@@ -134,7 +144,7 @@ def get_website_description(url, lang1, lang2=None):
     try:
         response = searx.network.get(url, headers=headers, timeout=10)
         response.raise_for_status()
-    except Exception:
+    except Exception:  # pylint: disable=broad-except
         return (None, None)
 
     try:
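The first of these two hunks is a behavior fix, not just lint appeasement: a bare ``except:`` also catches ``BaseException`` subclasses such as ``KeyboardInterrupt`` and ``SystemExit``, so Ctrl-C during a long crawl could be silently swallowed. A self-contained sketch (not from the PR):

def flaky_fetch():
    raise KeyboardInterrupt  # stand-in for the user pressing Ctrl-C mid-fetch

try:
    flaky_fetch()
except:  # noqa: E722 -- the old style: swallows the interrupt too
    print("bare except ate the Ctrl-C; the loop would keep running")

try:
    flaky_fetch()
except Exception:       # the new style: real errors only
    print("not reached")
except BaseException:   # present only so this demo exits cleanly
    print("except Exception let KeyboardInterrupt propagate")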

searxng_extra/update/update_external_bangs.py

@@ -1,17 +1,20 @@
 #!/usr/bin/env python
 # lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
-Update searx/data/external_bangs.json using the duckduckgo bangs.
+"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
+(:origin:`CI Update data ... <.github/workflows/data-update.yml>`).
 
 https://duckduckgo.com/newbang loads:
 
 * a javascript which provides the bang version ( https://duckduckgo.com/bv1.js )
 * a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example )
 
 This script loads the javascript, then the bangs.
 
-The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ),
-but most probably it will requires to update RE_BANG_VERSION
+The javascript URL may change in the future ( for example
+https://duckduckgo.com/bv2.js ), but most probably it will requires to update
+RE_BANG_VERSION
 """
 
 # pylint: disable=C0116
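For context, the version extraction the docstring alludes to is a small regex-over-javascript step. A hedged sketch (``RE_BANG_VERSION`` is the script's real name, but the exact pattern here is an assumption):

import re

RE_BANG_VERSION = re.compile(r'bang\.v([0-9]+)\.js')  # assumed pattern, may differ

bv1_js = "... l('https://duckduckgo.com/bang.v260.js') ..."  # illustrative bv1.js fragment
match = RE_BANG_VERSION.search(bv1_js)
if match:
    print(f"https://duckduckgo.com/bang.v{match.group(1)}.js")
    # -> https://duckduckgo.com/bang.v260.js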

searxng_extra/update/update_firefox_version.py

@@ -1,21 +1,30 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
+"""Fetch firefox useragent signatures
+
+Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+
+"""
 
 import json
-import requests
 import re
-from os.path import dirname, join
+from os.path import join
 from urllib.parse import urlparse, urljoin
-from distutils.version import LooseVersion, StrictVersion
+from distutils.version import LooseVersion
+
+import requests
 from lxml import html
+
 from searx import searx_dir
 
 URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
 RELEASE_PATH = '/pub/firefox/releases/'
 
-NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$')
-# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
-# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
+NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
+# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$')
+# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$')
 #
 
 useragents = {
@@ -32,20 +41,19 @@ def fetch_firefox_versions():
     resp = requests.get(URL, timeout=2.0)
     if resp.status_code != 200:
         raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code)
-    else:
-        dom = html.fromstring(resp.text)
-        versions = []
-
-        for link in dom.xpath('//a/@href'):
-            url = urlparse(urljoin(URL, link))
-            path = url.path
-            if path.startswith(RELEASE_PATH):
-                version = path[len(RELEASE_PATH) : -1]
-                if NORMAL_REGEX.match(version):
-                    versions.append(LooseVersion(version))
-
-        list.sort(versions, reverse=True)
-        return versions
+    dom = html.fromstring(resp.text)
+    versions = []
+
+    for link in dom.xpath('//a/@href'):
+        url = urlparse(urljoin(URL, link))
+        path = url.path
+        if path.startswith(RELEASE_PATH):
+            version = path[len(RELEASE_PATH) : -1]
+            if NORMAL_REGEX.match(version):
+                versions.append(LooseVersion(version))
+
+    list.sort(versions, reverse=True)
+    return versions
@@ -66,6 +74,7 @@ def get_useragents_filename():
     return join(join(searx_dir, "data"), "useragents.json")
 
 
-useragents["versions"] = fetch_firefox_last_versions()
-with open(get_useragents_filename(), "w") as f:
-    json.dump(useragents, f, indent=4, ensure_ascii=False)
+if __name__ == '__main__':
+    useragents["versions"] = fetch_firefox_last_versions()
+    with open(get_useragents_filename(), "w", encoding='utf-8') as f:
+        json.dump(useragents, f, indent=4, ensure_ascii=False)
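The raw-string change silences pylint's anomalous-backslash warnings without altering the regexes, and dropping the unused ``StrictVersion`` leaves ``LooseVersion`` doing the real work: numeric-aware ordering. (The str + int concatenation quirk noted for the Ahmia script applies to the ``raise`` here too.) A self-contained sketch, not from the PR; note ``distutils`` is deprecated since Python 3.10, with ``packaging.version`` as the usual replacement:

from distutils.version import LooseVersion

versions = ['95.0.2', '9.0', '100.0', '95.0']

# plain string sort gets numeric order wrong:
print(sorted(versions, reverse=True))
# -> ['95.0.2', '95.0', '9.0', '100.0']

# LooseVersion compares version components numerically, as the script needs:
print(sorted(versions, key=LooseVersion, reverse=True))
# -> ['100.0', '95.0.2', '95.0', '9.0']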

searxng_extra/update/update_languages.py

@@ -1,9 +1,17 @@
 #!/usr/bin/env python
+# lint: pylint
 # SPDX-License-Identifier: AGPL-3.0-or-later
-# This script generates languages.py from intersecting each engine's supported languages.
-#
-# Output files: searx/data/engines_languages.json and searx/languages.py
+"""This script generates languages.py from intersecting each engine's supported
+languages.
+
+Output files: :origin:`searx/data/engines_languages.json` and
+:origin:`searx/languages.py` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
+"""
+# pylint: disable=invalid-name
 
 import json
 from pathlib import Path
@@ -24,7 +32,7 @@ languages_file = Path(searx_dir) / 'languages.py'
 def fetch_supported_languages():
     set_timeout_for_thread(10.0)
 
-    engines_languages = dict()
+    engines_languages = {}
     names = list(engines)
     names.sort()
@@ -32,7 +40,7 @@ def fetch_supported_languages():
         if hasattr(engines[engine_name], 'fetch_supported_languages'):
             engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
             print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name))
-            if type(engines_languages[engine_name]) == list:
+            if type(engines_languages[engine_name]) == list:  # pylint: disable=unidiomatic-typecheck
                 engines_languages[engine_name] = sorted(engines_languages[engine_name])
 
     print("fetched languages from %s engines" % len(engines_languages))
@@ -55,7 +63,7 @@ def get_locale(lang_code):
 # Join all language lists.
 def join_language_lists(engines_languages):
-    language_list = dict()
+    language_list = {}
 
     for engine_name in engines_languages:
         for lang_code in engines_languages[engine_name]:
@@ -91,7 +99,7 @@ def join_language_lists(engines_languages):
                     'name': language_name,
                     'english_name': english_name,
                     'counter': set(),
-                    'countries': dict(),
+                    'countries': {},
                 }
 
             # add language with country if not in list
@@ -119,6 +127,7 @@ def join_language_lists(engines_languages):
 def filter_language_list(all_languages):
     min_engines_per_lang = 13
     min_engines_per_country = 7
+    # pylint: disable=consider-using-dict-items, consider-iterating-dictionary
     main_engines = [
         engine_name
         for engine_name in engines.keys()
@@ -138,7 +147,7 @@ def filter_language_list(all_languages):
     }
 
     def _copy_lang_data(lang, country_name=None):
-        new_dict = dict()
+        new_dict = {}
         new_dict['name'] = all_languages[lang]['name']
         new_dict['english_name'] = all_languages[lang]['english_name']
         if country_name:
@@ -146,10 +155,10 @@ def filter_language_list(all_languages):
         return new_dict
 
     # for each language get country codes supported by most engines or at least one country code
-    filtered_languages_with_countries = dict()
+    filtered_languages_with_countries = {}
     for lang, lang_data in filtered_languages.items():
         countries = lang_data['countries']
-        filtered_countries = dict()
+        filtered_countries = {}
 
         # get language's country codes with enough supported engines
         for lang_country, country_data in countries.items():
@@ -211,7 +220,7 @@ def write_languages_file(languages):
     language_codes = tuple(language_codes)
 
-    with open(languages_file, 'w') as new_file:
+    with open(languages_file, 'w', encoding='utf-8') as new_file:
         file_content = "{file_headers} {language_codes},\n)\n".format(
             # fmt: off
             file_headers = '\n'.join(file_headers),
@@ -224,7 +233,7 @@ def write_languages_file(languages):
 if __name__ == "__main__":
     load_engines(settings['engines'])
-    engines_languages = fetch_supported_languages()
-    all_languages = join_language_lists(engines_languages)
-    filtered_languages = filter_language_list(all_languages)
-    write_languages_file(filtered_languages)
+    _engines_languages = fetch_supported_languages()
+    _all_languages = join_language_lists(_engines_languages)
+    _filtered_languages = filter_language_list(_all_languages)
+    write_languages_file(_filtered_languages)
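The underscore renames in the final hunk are what satisfy pylint here: module-level names that match function parameters trigger ``redefined-outer-name`` (W0621) inside every function using those parameters. A minimal sketch (toy data, not the script's real flow):

def join_language_lists(engines_languages):
    # with a module-level ``engines_languages`` in scope, pylint would flag
    # this parameter as W0621 (redefined-outer-name)
    return {"engines": len(engines_languages)}

if __name__ == "__main__":
    # the leading underscore avoids colliding with the parameter name above
    _engines_languages = {"google": ["en", "de"], "brave": ["en"]}
    print(join_language_lists(_engines_languages))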

searxng_extra/update/update_osm_keys_tags.py

@@ -5,7 +5,10 @@
 To get the i18n names, the scripts uses `Wikidata Query Service`_ instead of for
 example `OSM tags API`_ (sidenote: the actual change log from
-map.atownsend.org.uk_ might be useful to normalize OSM tags)
+map.atownsend.org.uk_ might be useful to normalize OSM tags).
+
+Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
+<.github/workflows/data-update.yml>`).
 
 .. _Wikidata Query Service: https://query.wikidata.org/
 .. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc

searxng_extra/update/update_wikidata_units.py

@@ -3,6 +3,13 @@
 # lint: pylint
 # pylint: disable=missing-module-docstring
+"""Fetch units from :origin:`searx/engines/wikidata.py` engine.
+
+Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
+... <.github/workflows/data-update.yml>`).
+
+"""
 
 import json
 import collections
@@ -54,5 +61,6 @@ def get_wikidata_units_filename():
     return join(join(searx_dir, "data"), "wikidata_units.json")
 
 
-with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
-    json.dump(get_data(), f, indent=4, ensure_ascii=False)
+if __name__ == '__main__':
+    with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
+        json.dump(get_data(), f, indent=4, ensure_ascii=False)
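The rewritten writers share these dump settings; ``ensure_ascii=False`` plus an explicit UTF-8 file encoding is what keeps non-ASCII unit symbols readable in the generated data instead of escaped. A tiny illustration (toy data, not the real file layout):

import json

units = {"ohm": "Ω", "micrometre": "µm"}  # toy data

print(json.dumps(units, indent=4))                      # escaped: "\u03a9"
print(json.dumps(units, indent=4, ensure_ascii=False))  # readable: "Ω"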