fix utils/fetch_languages to work with new languages

This commit is contained in:
Marc Abonce Seguin 2020-09-14 00:07:45 -07:00 committed by Alexandre Flament
parent 41800835f9
commit c86504b47a
4 changed files with 2912 additions and 1054 deletions

View file

@ -239,7 +239,8 @@ test.sh:
test.pep8: pyenvinstall test.pep8: pyenvinstall
@echo "TEST pep8" @echo "TEST pep8"
$(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/engines/gigablast.py' --max-line-length=120 --ignore "E402,W503" searx tests $(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py' \
--max-line-length=120 --ignore "E402,W503" searx tests
test.unit: pyenvinstall test.unit: pyenvinstall
@echo "TEST tests/unit" @echo "TEST tests/unit"

File diff suppressed because it is too large Load diff

View file

@ -1,75 +1,96 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# list of language codes # list of language codes
# this file is generated automatically by utils/update_search_languages.py # this file is generated automatically by utils/fetch_languages.py
language_codes = \
language_codes = ( ( ('af-ZA', 'Afrikaans', '', 'Afrikaans'),
("af-NA", "Afrikaans", "", "Afrikaans"), ('am-ET', 'አማርኛ', '', 'Amharic'),
("ar-SA", "العربية", "", "Arabic"), ('ar-EG', 'العربية', '', 'Arabic'),
("be-BY", "Беларуская", "", "Belarusian"), ('az-AZ', 'Azərbaycan', '', 'Azerbaijani'),
("bg-BG", "Български", "", "Bulgarian"), ('be-BY', 'Беларуская', '', 'Belarusian'),
("ca-AD", "Català", "", "Catalan"), ('bg-BG', 'Български', '', 'Bulgarian'),
("cs-CZ", "Čeština", "", "Czech"), ('bn-BD', 'বাংলা', '', 'Bangla'),
("da-DK", "Dansk", "", "Danish"), ('bs-BA', 'Bosanski', '', 'Bosnian'),
("de", "Deutsch", "", "German"), ('ca-ES', 'Català', '', 'Catalan'),
("de-AT", "Deutsch", "Österreich", "German"), ('cs-CZ', 'Čeština', '', 'Czech'),
("de-CH", "Deutsch", "Schweiz", "German"), ('da-DK', 'Dansk', '', 'Danish'),
("de-DE", "Deutsch", "Deutschland", "German"), ('de', 'Deutsch', '', 'German'),
("el-GR", "Ελληνικά", "", "Greek"), ('de-AT', 'Deutsch', 'Österreich', 'German'),
("en", "English", "", "English"), ('de-CH', 'Deutsch', 'Schweiz', 'German'),
("en-AU", "English", "Australia", "English"), ('de-DE', 'Deutsch', 'Deutschland', 'German'),
("en-CA", "English", "Canada", "English"), ('el-GR', 'Ελληνικά', '', 'Greek'),
("en-GB", "English", "United Kingdom", "English"), ('en', 'English', '', 'English'),
("en-IE", "English", "Ireland", "English"), ('en-AU', 'English', 'Australia', 'English'),
("en-IN", "English", "India", "English"), ('en-CA', 'English', 'Canada', 'English'),
("en-NZ", "English", "New Zealand", "English"), ('en-GB', 'English', 'United Kingdom', 'English'),
("en-PH", "English", "Philippines", "English"), ('en-IE', 'English', 'Ireland', 'English'),
("en-SG", "English", "Singapore", "English"), ('en-IN', 'English', 'India', 'English'),
("en-US", "English", "United States", "English"), ('en-NZ', 'English', 'New Zealand', 'English'),
("es", "Español", "", "Spanish"), ('en-PH', 'English', 'Philippines', 'English'),
("es-AR", "Español", "Argentina", "Spanish"), ('en-SG', 'English', 'Singapore', 'English'),
("es-CL", "Español", "Chile", "Spanish"), ('en-US', 'English', 'United States', 'English'),
("es-ES", "Español", "España", "Spanish"), ('es', 'Español', '', 'Spanish'),
("es-MX", "Español", "México", "Spanish"), ('es-AR', 'Español', 'Argentina', 'Spanish'),
("et-EE", "Eesti", "", "Estonian"), ('es-CL', 'Español', 'Chile', 'Spanish'),
("fa-IR", "فارسی", "", "Persian"), ('es-ES', 'Español', 'España', 'Spanish'),
("fi-FI", "Suomi", "", "Finnish"), ('es-MX', 'Español', 'México', 'Spanish'),
("fr", "Français", "", "French"), ('et-EE', 'Eesti', '', 'Estonian'),
("fr-BE", "Français", "Belgique", "French"), ('fa-IR', 'فارسی', '', 'Persian'),
("fr-CA", "Français", "Canada", "French"), ('fi-FI', 'Suomi', '', 'Finnish'),
("fr-CH", "Français", "Suisse", "French"), ('fo-FO', 'Føroyskt', '', 'Faroese'),
("fr-FR", "Français", "France", "French"), ('fr', 'Français', '', 'French'),
("he-IL", "עברית", "", "Hebrew"), ('fr-BE', 'Français', 'Belgique', 'French'),
("hr-HR", "Hrvatski", "", "Croatian"), ('fr-CA', 'Français', 'Canada', 'French'),
("hu-HU", "Magyar", "", "Hungarian"), ('fr-CH', 'Français', 'Suisse', 'French'),
("hy-AM", "Հայերեն", "", "Armenian"), ('fr-FR', 'Français', 'France', 'French'),
("id-ID", "Indonesia", "", "Indonesian"), ('he-IL', 'עברית', '', 'Hebrew'),
("is-IS", "Íslenska", "", "Icelandic"), ('hr-HR', 'Hrvatski', '', 'Croatian'),
("it-IT", "Italiano", "", "Italian"), ('hu-HU', 'Magyar', '', 'Hungarian'),
("ja-JP", "日本語", "", "Japanese"), ('hy-AM', 'Հայերեն', '', 'Armenian'),
("ko-KR", "한국어", "", "Korean"), ('id-ID', 'Indonesia', '', 'Indonesian'),
("lt-LT", "Lietuvių", "", "Lithuanian"), ('is-IS', 'Íslenska', '', 'Icelandic'),
("lv-LV", "Latviešu", "", "Latvian"), ('it-IT', 'Italiano', '', 'Italian'),
("ms-MY", "Melayu", "", "Malay"), ('ja-JP', '日本語', '', 'Japanese'),
("nb-NO", "Norsk Bokmål", "", "Norwegian Bokmål"), ('ka-GE', 'ქართული', '', 'Georgian'),
("nl", "Nederlands", "", "Dutch"), ('kk-KZ', 'Қазақ Тілі', '', 'Kazakh'),
("nl-BE", "Nederlands", "België", "Dutch"), ('km-KH', 'ខ្មែរ', '', 'Khmer'),
("nl-NL", "Nederlands", "Nederland", "Dutch"), ('ko-KR', '한국어', '', 'Korean'),
("pl-PL", "Polski", "", "Polish"), ('ky-KG', 'Кыргызча', '', 'Kyrgyz'),
("pt", "Português", "", "Portuguese"), ('lo-LA', 'ລາວ', '', 'Lao'),
("pt-BR", "Português", "Brasil", "Portuguese"), ('lt-LT', 'Lietuvių', '', 'Lithuanian'),
("pt-PT", "Português", "Portugal", "Portuguese"), ('lv-LV', 'Latviešu', '', 'Latvian'),
("ro-RO", "Română", "", "Romanian"), ('mk-MK', 'Македонски', '', 'Macedonian'),
("ru-RU", "Русский", "", "Russian"), ('mn-MN', 'Монгол', '', 'Mongolian'),
("sk-SK", "Slovenčina", "", "Slovak"), ('ms-MY', 'Melayu', '', 'Malay'),
("sl-SI", "Slovenščina", "", "Slovenian"), ('mt-MT', 'Malti', '', 'Maltese'),
("sr-RS", "Srpski", "", "Serbian"), ('nb-NO', 'Norsk Bokmål', '', 'Norwegian Bokmål'),
("sv-SE", "Svenska", "", "Swedish"), ('ne-NP', 'नेपाली', '', 'Nepali'),
("sw-KE", "Kiswahili", "", "Swahili"), ('nl', 'Nederlands', '', 'Dutch'),
("th-TH", "ไทย", "", "Thai"), ('nl-BE', 'Nederlands', 'België', 'Dutch'),
("tr-TR", "Türkçe", "", "Turkish"), ('nl-NL', 'Nederlands', 'Nederland', 'Dutch'),
("uk-UA", "Українська", "", "Ukrainian"), ('pl-PL', 'Polski', '', 'Polish'),
("vi-VN", "Tiếng Việt", "", "Vietnamese"), ('pt', 'Português', '', 'Portuguese'),
("zh", "中文", "", "Chinese"), ('pt-BR', 'Português', 'Brasil', 'Portuguese'),
("zh-CN", "中文", "中国", "Chinese"), ('pt-PT', 'Português', 'Portugal', 'Portuguese'),
("zh-TW", "中文", "台灣", "Chinese") ('ro-RO', 'Română', '', 'Romanian'),
) ('ru-RU', 'Русский', '', 'Russian'),
('rw-RW', 'Kinyarwanda', '', 'Kinyarwanda'),
('si-LK', 'සිංහල', '', 'Sinhala'),
('sk-SK', 'Slovenčina', '', 'Slovak'),
('sl-SI', 'Slovenščina', '', 'Slovenian'),
('so-SO', 'Soomaali', '', 'Somali'),
('sq-AL', 'Shqip', '', 'Albanian'),
('sr-RS', 'Srpski', '', 'Serbian'),
('sv-SE', 'Svenska', '', 'Swedish'),
('sw-TZ', 'Kiswahili', '', 'Swahili'),
('tg-TJ', 'Тоҷикӣ', '', 'Tajik'),
('th-TH', 'ไทย', '', 'Thai'),
('ti-ET', 'ትግርኛ', '', 'Tigrinya'),
('tk-TM', 'Türkmen Dili', '', 'Turkmen'),
('tr-TR', 'Türkçe', '', 'Turkish'),
('uk-UA', 'Українська', '', 'Ukrainian'),
('ur-PK', 'اردو', '', 'Urdu'),
('uz-UZ', 'OZbek', '', 'Uzbek'),
('vi-VN', 'Tiếng Việt', '', 'Vietnamese'),
('zh', '中文', '', 'Chinese'),
('zh-CN', '中文', '中国', 'Chinese'),
('zh-TW', '中文', '台灣', 'Chinese'))

View file

@ -6,7 +6,7 @@
# are written in current directory to avoid overwriting in case something goes wrong. # are written in current directory to avoid overwriting in case something goes wrong.
import json import json
import io from pprint import pformat
from sys import path from sys import path
from babel import Locale, UnknownLocaleError from babel import Locale, UnknownLocaleError
from babel.languages import get_global from babel.languages import get_global
@ -23,7 +23,7 @@ languages_file = 'languages.py'
# Fetches supported languages for each engine and writes json file with those. # Fetches supported languages for each engine and writes json file with those.
def fetch_supported_languages(): def fetch_supported_languages():
engines_languages = {} engines_languages = dict()
names = list(engines) names = list(engines)
names.sort() names.sort()
@ -51,19 +51,9 @@ def get_locale(lang_code):
return None return None
# Append engine_name to list of engines that support locale.
def add_engine_counter(lang_code, engine_name, languages):
if lang_code in languages:
if 'counter' not in languages[lang_code]:
languages[lang_code]['counter'] = [engine_name]
elif engine_name not in languages[lang_code]['counter']:
languages[lang_code]['counter'].append(engine_name)
# Join all language lists. # Join all language lists.
# TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(engines_languages): def join_language_lists(engines_languages):
language_list = {} language_list = dict()
for engine_name in engines_languages: for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]: for lang_code in engines_languages[engine_name]:
@ -76,32 +66,51 @@ def join_language_lists(engines_languages):
# ensure that lang_code uses standard language and country codes # ensure that lang_code uses standard language and country codes
if locale and locale.territory: if locale and locale.territory:
lang_code = locale.language + '-' + locale.territory lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
short_code = lang_code.split('-')[0]
# add locale if it's not in list # add language without country if not in list
if lang_code not in language_list: if short_code not in language_list:
if locale: if locale:
language_list[lang_code] = {'name': locale.get_language_name().title(), # get language's data from babel's Locale object
'english_name': locale.english_name, language_name = locale.get_language_name().title()
'country': locale.get_territory_name() or ''} english_name = locale.english_name.split(' (')[0]
elif short_code in engines_languages['wikipedia']:
# also add language without country # get language's data from wikipedia if not known by babel
if locale.language not in language_list: language_name = engines_languages['wikipedia'][short_code]['name']
language_list[locale.language] = {'name': locale.get_language_name().title(), english_name = engines_languages['wikipedia'][short_code]['english_name']
'english_name': locale.english_name}
else: else:
language_list[lang_code] = {} language_name = None
english_name = None
# add language to list
language_list[short_code] = {'name': language_name,
'english_name': english_name,
'counter': set(),
'countries': dict()}
# add language with country if not in list
if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
country_name = ''
if locale:
# get country name from babel's Locale object
country_name = locale.get_territory_name()
language_list[short_code]['countries'][lang_code] = {'country_name': country_name,
'counter': set()}
# count engine for both language_country combination and language alone # count engine for both language_country combination and language alone
add_engine_counter(lang_code, engine_name, language_list) language_list[short_code]['counter'].add(engine_name)
add_engine_counter(lang_code.split('-')[0], engine_name, language_list) if lang_code != short_code:
language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
return language_list return language_list
# Filter language list so it only includes the most supported languages and countries. # Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages): def filter_language_list(all_languages):
min_supported_engines = 10 min_engines_per_lang = 15
min_engines_per_country = 10
main_engines = [engine_name for engine_name in engines.keys() main_engines = [engine_name for engine_name in engines.keys()
if 'general' in engines[engine_name].categories and if 'general' in engines[engine_name].categories and
engines[engine_name].supported_languages and engines[engine_name].supported_languages and
@ -110,82 +119,84 @@ def filter_language_list(all_languages):
# filter list to include only languages supported by most engines or all default general engines # filter list to include only languages supported by most engines or all default general engines
filtered_languages = {code: lang for code, lang filtered_languages = {code: lang for code, lang
in all_languages.items() in all_languages.items()
if (len(lang.get('counter', [])) >= min_supported_engines or if (len(lang['counter']) >= min_engines_per_lang or
all(main_engine in lang.get('counter', []) all(main_engine in lang['counter']
for main_engine in main_engines))} for main_engine in main_engines))}
return filtered_languages def _copy_lang_data(lang, country_name=None):
new_dict = dict()
new_dict['name'] = all_languages[lang]['name']
new_dict['english_name'] = all_languages[lang]['english_name']
if country_name:
new_dict['country_name'] = country_name
return new_dict
def _country_count(i):
return len(countries[sorted_countries[i]]['counter'])
# Add country codes to languages without one and filter out language codes. # for each language get country codes supported by most engines or at least one country code
def assign_country_codes(filtered_languages, all_languages): filtered_languages_with_countries = dict()
sorted_languages = sorted(all_languages, for lang, lang_data in filtered_languages.items():
key=lambda lang: len(all_languages[lang].get('counter', [])), countries = lang_data['countries']
reverse=True) filtered_countries = dict()
previous_lang = None
previous_code = None
countries = 0
for current_code in sorted(filtered_languages):
current_lang = current_code.split('-')[0]
# count country codes per language # get language's country codes with enough supported engines
if current_lang == previous_lang: for lang_country, country_data in countries.items():
countries += 1 if len(country_data['counter']) >= min_engines_per_country:
filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
else: # add language without countries too if there's more than one country to choose from
if previous_lang is not None: if len(filtered_countries) > 1:
# if language has no single country code filtered_countries[lang] = _copy_lang_data(lang)
if countries == 0: elif len(filtered_countries) == 1:
# try to get country code with most supported engines # if there's only one country per language, it's not necessary to show country name
for l in sorted_languages: lang_country = next(iter(filtered_countries))
l_parts = l.split('-') filtered_countries[lang_country]['country_name'] = None
if len(l_parts) == 2 and l_parts[0] == previous_lang:
filtered_languages[l] = all_languages[l]
filtered_languages[l]['country'] = ''
countries = 1
break
if countries == 0: # if no country has enough engines try to get most likely country code from babel
# get most likely country code from babel if not filtered_countries:
subtags = get_global('likely_subtags').get(previous_lang) lang_country = None
if subtags: subtags = get_global('likely_subtags').get(lang)
subtag_parts = subtags.split('_') if subtags:
new_code = subtag_parts[0] + '-' + subtag_parts[-1] country_code = subtags.split('_')[-1]
filtered_languages[new_code] = all_languages[previous_lang] if len(country_code) == 2:
countries = 1 lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
if countries == 1: if lang_country:
# remove countryless version of language if there's only one country filtered_countries[lang_country] = _copy_lang_data(lang)
del filtered_languages[previous_lang] else:
if previous_code in filtered_languages: filtered_countries[lang] = _copy_lang_data(lang)
filtered_languages[previous_code]['country'] = ''
countries = 0 filtered_languages_with_countries.update(filtered_countries)
previous_lang = current_lang
previous_code = current_code return filtered_languages_with_countries
# Write languages.py. # Write languages.py.
def write_languages_file(languages): def write_languages_file(languages):
new_file = open(languages_file, 'wb') file_headers = (
file_content = '# -*- coding: utf-8 -*-\n'\ "# -*- coding: utf-8 -*-",
+ '# list of language codes\n'\ "# list of language codes",
+ '# this file is generated automatically by utils/update_search_languages.py\n'\ "# this file is generated automatically by utils/fetch_languages.py",
+ '\nlanguage_codes = (' "language_codes ="
for code in sorted(languages): )
if 'name' in languages[code]:
file_content += '\n ("' + code + '"'\ language_codes = tuple([
+ ', "' + languages[code]['name'].split(' (')[0] + '"'\ (
+ ', "' + languages[code].get('country', '') + '"'\ code,
+ ', "' + languages[code].get('english_name', '').split(' (')[0] + '"),' languages[code]['name'].split(' (')[0],
else: languages[code].get('country_name') or '',
print('ignore ',languages[code]) languages[code].get('english_name') or ''
# remove last comma ) for code in sorted(languages)
file_content = file_content[:-1] ])
file_content += '\n)\n'
new_file.write(file_content.encode()) with open(languages_file, 'w') as new_file:
new_file.close() file_content = "{file_headers} \\\n{language_codes}".format(
file_headers='\n'.join(file_headers),
language_codes=pformat(language_codes, indent=4)
)
new_file.write(file_content)
new_file.close()
if __name__ == "__main__": if __name__ == "__main__":
@ -193,5 +204,4 @@ if __name__ == "__main__":
engines_languages = fetch_supported_languages() engines_languages = fetch_supported_languages()
all_languages = join_language_lists(engines_languages) all_languages = join_language_lists(engines_languages)
filtered_languages = filter_language_list(all_languages) filtered_languages = filter_language_list(all_languages)
assign_country_codes(filtered_languages, all_languages)
write_languages_file(filtered_languages) write_languages_file(filtered_languages)