Fix fetch_languages to be more accurate

Add languages that are supported either by all default general engines or by at least 10 engines.
This commit is contained in:
Marc Abonce Seguin 2018-02-14 16:17:46 -06:00
parent b9d4c0523e
commit d1eae9359f
5 changed files with 154 additions and 150 deletions

File diff suppressed because one or more lines are too long

View file

@ -23,7 +23,7 @@ from searx.url_utils import urlencode
categories = ['general'] categories = ['general']
paging = True paging = True
language_support = True language_support = True
supported_languages_url = 'https://duckduckgo.com/d2030.js' supported_languages_url = 'https://duckduckgo.com/util/u172.js'
time_range_support = True time_range_support = True
# search-url # search-url

View file

@ -72,7 +72,7 @@ country_to_hostname = {
'RO': 'www.google.ro', # Romania 'RO': 'www.google.ro', # Romania
'RU': 'www.google.ru', # Russia 'RU': 'www.google.ru', # Russia
'SK': 'www.google.sk', # Slovakia 'SK': 'www.google.sk', # Slovakia
'SL': 'www.google.si', # Slovenia (SL -> si) 'SI': 'www.google.si', # Slovenia
'SE': 'www.google.se', # Sweden 'SE': 'www.google.se', # Sweden
'TH': 'www.google.co.th', # Thailand 'TH': 'www.google.co.th', # Thailand
'TR': 'www.google.com.tr', # Turkey 'TR': 'www.google.com.tr', # Turkey

View file

@ -5,11 +5,7 @@
language_codes = ( language_codes = (
(u"ar-SA", u"العربية", u"", u"Arabic"), (u"ar-SA", u"العربية", u"", u"Arabic"),
(u"bg-BG", u"Български", u"", u"Bulgarian"), (u"bg-BG", u"Български", u"", u"Bulgarian"),
(u"ca", u"Català", u"", u"Catalan"), (u"ca-ES", u"Català", u"", u"Catalan"),
(u"ca-AD", u"Català", u"Andorra", u"Catalan"),
(u"ca-CT", u"Català", u"", u"Catalan"),
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
(u"ca-FR", u"Català", u"França", u"Catalan"),
(u"cs-CZ", u"Čeština", u"", u"Czech"), (u"cs-CZ", u"Čeština", u"", u"Czech"),
(u"da-DK", u"Dansk", u"", u"Danish"), (u"da-DK", u"Dansk", u"", u"Danish"),
(u"de", u"Deutsch", u"", u"German"), (u"de", u"Deutsch", u"", u"German"),
@ -21,55 +17,51 @@ language_codes = (
(u"en-AU", u"English", u"Australia", u"English"), (u"en-AU", u"English", u"Australia", u"English"),
(u"en-CA", u"English", u"Canada", u"English"), (u"en-CA", u"English", u"Canada", u"English"),
(u"en-GB", u"English", u"United Kingdom", u"English"), (u"en-GB", u"English", u"United Kingdom", u"English"),
(u"en-ID", u"English", u"Indonesia", u"English"),
(u"en-IE", u"English", u"Ireland", u"English"),
(u"en-IN", u"English", u"India", u"English"), (u"en-IN", u"English", u"India", u"English"),
(u"en-MY", u"English", u"Malaysia", u"English"), (u"en-MY", u"English", u"Malaysia", u"English"),
(u"en-NZ", u"English", u"New Zealand", u"English"),
(u"en-PH", u"English", u"Philippines", u"English"),
(u"en-SG", u"English", u"Singapore", u"English"),
(u"en-US", u"English", u"United States", u"English"), (u"en-US", u"English", u"United States", u"English"),
(u"en-ZA", u"English", u"South Africa", u"English"),
(u"es", u"Español", u"", u"Spanish"), (u"es", u"Español", u"", u"Spanish"),
(u"es-AD", u"Español", u"Andorra", u"Spanish"),
(u"es-AR", u"Español", u"Argentina", u"Spanish"), (u"es-AR", u"Español", u"Argentina", u"Spanish"),
(u"es-CL", u"Español", u"Chile", u"Spanish"),
(u"es-CO", u"Español", u"Colombia", u"Spanish"),
(u"es-ES", u"Español", u"España", u"Spanish"), (u"es-ES", u"Español", u"España", u"Spanish"),
(u"es-MX", u"Español", u"México", u"Spanish"), (u"es-MX", u"Español", u"México", u"Spanish"),
(u"es-PE", u"Español", u"Perú", u"Spanish"),
(u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
(u"et-EE", u"Eesti", u"", u"Estonian"), (u"et-EE", u"Eesti", u"", u"Estonian"),
(u"fa-IR", u"فارسی", u"", u"Persian"),
(u"fi-FI", u"Suomi", u"", u"Finnish"), (u"fi-FI", u"Suomi", u"", u"Finnish"),
(u"fr", u"Français", u"", u"French"), (u"fr", u"Français", u"", u"French"),
(u"fr-AD", u"Français", u"Andorre", u"French"),
(u"fr-BE", u"Français", u"Belgique", u"French"), (u"fr-BE", u"Français", u"Belgique", u"French"),
(u"fr-CA", u"Français", u"Canada", u"French"), (u"fr-CA", u"Français", u"Canada", u"French"),
(u"fr-CH", u"Français", u"Suisse", u"French"), (u"fr-CH", u"Français", u"Suisse", u"French"),
(u"fr-FR", u"Français", u"France", u"French"), (u"fr-FR", u"Français", u"France", u"French"),
(u"he-IL", u"עברית", u"", u"Hebrew"), (u"he-IL", u"עברית", u"", u"Hebrew"),
(u"hr-HR", u"Hrvatski", u"", u"Croatian"),
(u"hu-HU", u"Magyar", u"", u"Hungarian"), (u"hu-HU", u"Magyar", u"", u"Hungarian"),
(u"it", u"Italiano", u"", u"Italian"), (u"id-ID", u"Indonesia", u"", u"Indonesian"),
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"), (u"is-IS", u"Íslenska", u"", u"Icelandic"),
(u"it-IT", u"Italiano", u"Italia", u"Italian"), (u"it-IT", u"Italiano", u"", u"Italian"),
(u"ja-JP", u"日本語", u"", u"Japanese"), (u"ja-JP", u"日本語", u"", u"Japanese"),
(u"ko-KR", u"한국어", u"", u"Korean"), (u"ko-KR", u"한국어", u"", u"Korean"),
(u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
(u"lv-LV", u"Latviešu", u"", u"Latvian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
(u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
(u"nl", u"Nederlands", u"", u"Dutch"), (u"nl", u"Nederlands", u"", u"Dutch"),
(u"nl-BE", u"Nederlands", u"België", u"Dutch"), (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"), (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
(u"no-NO", u"Norsk", u"", u"Norwegian"),
(u"pl-PL", u"Polski", u"", u"Polish"), (u"pl-PL", u"Polski", u"", u"Polish"),
(u"pt", u"Português", u"", u"Portuguese"), (u"pt", u"Português", u"", u"Portuguese"),
(u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"), (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"), (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
(u"ro-RO", u"Română", u"", u"Romanian"), (u"ro-RO", u"Română", u"", u"Romanian"),
(u"ru-RU", u"Русский", u"", u"Russian"), (u"ru-RU", u"Русский", u"", u"Russian"),
(u"sk-SK", u"Slovenčina", u"", u"Slovak"),
(u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
(u"sr-RS", u"Српски", u"", u"Serbian"),
(u"sv-SE", u"Svenska", u"", u"Swedish"), (u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"th-TH", u"ไทย", u"", u"Thai"), (u"th-TH", u"ไทย", u"", u"Thai"),
(u"tr-TR", u"Türkçe", u"", u"Turkish"), (u"tr-TR", u"Türkçe", u"", u"Turkish"),
(u"uk-UA", u"Українська", u"", u"Ukrainian"),
(u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
(u"zh", u"中文", u"", u"Chinese"), (u"zh", u"中文", u"", u"Chinese"),
(u"zh-CN", u"中文", u"中国", u"Chinese"), (u"zh-CN", u"中文", u"中国", u"Chinese"),
(u"zh-HK", u"中文", u"香港", u"Chinese"), (u"zh-TW", u"中文", u"台灣", u"Chinese")
(u"zh-TW", u"中文", u"台湾", u"Chinese")
) )

View file

@ -2,83 +2,40 @@
# This script generates languages.py from intersecting each engine's supported languages. # This script generates languages.py from intersecting each engine's supported languages.
# #
# The country names are obtained from http://api.geonames.org which requires registering as a user.
#
# Output files (engines_languages.json and languages.py) # Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong. # are written in current directory to avoid overwriting in case something goes wrong.
from requests import get from json import dump
from lxml.html import fromstring
from json import loads, dump
import io import io
from sys import path from sys import path
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
path.append('../searx') # noqa path.append('../searx') # noqa
from searx import settings from searx import settings
from searx.url_utils import urlencode
from searx.engines import initialize_engines, engines from searx.engines import initialize_engines, engines
# Geonames API for country names.
geonames_user = '' # ADD USER NAME HERE
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
# Output files. # Output files.
engines_languages_file = 'engines_languages.json' engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py' languages_file = 'languages.py'
engines_languages = {} # custom fixes for non standard locale codes
# sl-SL is technically not invalid, but still a mistake
# TODO: move to respective engines
# To filter out invalid codes and dialects. locale_fixes = {
def valid_code(lang_code): 'sl-sl': 'sl-SI',
# filter invalid codes 'ar-xa': 'ar-SA',
# sl-SL is technically not invalid, but still a mistake 'es-xl': 'es-419',
invalid_codes = ['sl-SL', 'wt-WT', 'jw'] 'zh-chs': 'zh-Hans-CN',
invalid_countries = ['UK', 'XA', 'XL'] 'zh-cht': 'zh-Hant-TW',
if lang_code[:2] == 'xx'\ 'tzh-tw': 'zh-Hant-TW',
or lang_code in invalid_codes\ 'tzh-hk': 'zh-Hant-HK'
or lang_code[-2:] in invalid_countries\ }
or is_dialect(lang_code):
return False
return True
# Language codes with any additional tags other than language and country.
def is_dialect(lang_code):
lang_code = lang_code.split('-')
if len(lang_code) > 2 or len(lang_code[0]) > 3:
return True
if len(lang_code) == 2 and len(lang_code[1]) > 2:
return True
return False
# Get country name in specified language.
def get_country_name(locale):
if geonames_user is '':
return ''
locale = locale.split('-')
if len(locale) != 2:
return ''
url = country_names_url.format(parameters=urlencode({'lang': locale[0],
'country': locale[1],
'username': geonames_user}))
response = get(url)
json = loads(response.text)
content = json.get('geonames', None)
if content is None or len(content) != 1:
print("No country name found for " + locale[0] + "-" + locale[1])
return ''
return content[0].get('countryName', '')
# Fetches supported languages for each engine and writes a json file with those. # Fetches supported languages for each engine and writes a json file with those.
def fetch_supported_languages(): def fetch_supported_languages():
initialize_engines(settings['engines']) engines_languages = {}
for engine_name in engines: for engine_name in engines:
if hasattr(engines[engine_name], 'fetch_supported_languages'): if hasattr(engines[engine_name], 'fetch_supported_languages'):
try: try:
@ -90,81 +47,134 @@ def fetch_supported_languages():
with io.open(engines_languages_file, "w", encoding="utf-8") as f: with io.open(engines_languages_file, "w", encoding="utf-8") as f:
dump(engines_languages, f, ensure_ascii=False) dump(engines_languages, f, ensure_ascii=False)
return engines_languages
# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
try:
locale = Locale.parse(lang_code, sep='-')
return locale
except (UnknownLocaleError, ValueError):
return None
# Append engine_name to list of engines that support locale.
def add_engine_counter(lang_code, engine_name, languages):
if lang_code in languages:
if 'counter' not in languages[lang_code]:
languages[lang_code]['counter'] = [engine_name]
elif engine_name not in languages[lang_code]['counter']:
languages[lang_code]['counter'].append(engine_name)
# Join all language lists. # Join all language lists.
# Iterate all languages supported by each engine. # TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(): def join_language_lists(engines_languages):
global languages language_list = {}
# include wikipedia first for more accurate language names
languages = {code: lang for code, lang
in engines_languages['wikipedia'].items()
if valid_code(code)}
for engine_name in engines_languages: for engine_name in engines_languages:
for locale in engines_languages[engine_name]: for lang_code in engines_languages[engine_name]:
if valid_code(locale):
# if language is not on list or if it has no name yet # apply custom fixes if necessary
if locale not in languages or not languages[locale].get('name'): if lang_code.lower() in locale_fixes:
if isinstance(engines_languages[engine_name], dict): lang_code = locale_fixes[lang_code.lower()]
languages[locale] = engines_languages[engine_name][locale]
locale = get_locale(lang_code)
# ensure that lang_code uses standard language and country codes
if locale and locale.territory:
lang_code = locale.language + '-' + locale.territory
# add locale if it's not in list
if lang_code not in language_list:
if locale:
language_list[lang_code] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name,
'country': locale.get_territory_name() or ''}
# also add language without country
if locale.language not in language_list:
language_list[locale.language] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name}
else: else:
languages[locale] = {} language_list[lang_code] = {}
# add to counter of engines that support given language # count engine for both language_country combination and language alone
lang = locale.split('-')[0] add_engine_counter(lang_code, engine_name, language_list)
if lang in languages: add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
if 'counter' not in languages[lang]:
languages[lang]['counter'] = [engine_name]
elif engine_name not in languages[lang]['counter']:
languages[lang]['counter'].append(engine_name)
# filter list to include only languages supported by most engines return language_list
min_supported_engines = int(0.70 * len(engines_languages))
languages = {code: lang for code, lang
in languages.items()
if len(lang.get('counter', [])) >= min_supported_engines or
len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
# get locales that have no name or country yet
for locale in languages.keys():
# try to get language names
if not languages[locale].get('name'):
name = languages.get(locale.split('-')[0], {}).get('name', None)
if name:
languages[locale]['name'] = name
else:
# filter out locales with no name
del languages[locale]
continue
# try to get language name in english
if not languages[locale].get('english_name'):
languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
# try to get country name
if locale.find('-') > 0 and not languages[locale].get('country'):
languages[locale]['country'] = get_country_name(locale) or ''
# Remove countryless language if language is featured in only one country. # Filter language list so it only includes the most supported languages and countries.
def filter_single_country_languages(): def filter_language_list(all_languages):
prev_lang = None min_supported_engines = 10
prev_code = None main_engines = [engine_name for engine_name in engines.keys()
for code in sorted(languages): if 'general' in engines[engine_name].categories and
lang = code.split('-')[0] engines[engine_name].supported_languages and
if lang == prev_lang: not engines[engine_name].disabled]
countries += 1
else: # filter list to include only languages supported by most engines or all default general engines
if prev_lang is not None and countries == 1: filtered_languages = {code: lang for code, lang
del languages[prev_lang] in all_languages.items()
languages[prev_code]['country'] = '' if (len(lang.get('counter', [])) >= min_supported_engines or
all(main_engine in lang.get('counter', [])
for main_engine in main_engines))}
return filtered_languages
# Add country codes to languages without one and filter out language codes.
def assign_country_codes(filtered_languages, all_languages):
sorted_languages = sorted(all_languages,
key=lambda lang: len(all_languages[lang].get('counter', [])),
reverse=True)
previous_lang = None
previous_code = None
countries = 0 countries = 0
prev_lang = lang for current_code in sorted(filtered_languages):
prev_code = code current_lang = current_code.split('-')[0]
# count country codes per language
if current_lang == previous_lang:
countries += 1
else:
if previous_lang is not None:
# if language has no single country code
if countries == 0:
# try to get country code with most supported engines
for l in sorted_languages:
l_parts = l.split('-')
if len(l_parts) == 2 and l_parts[0] == previous_lang:
filtered_languages[l] = all_languages[l]
filtered_languages[l]['country'] = ''
countries = 1
break
if countries == 0:
# get most likely country code from babel
subtags = get_global('likely_subtags').get(previous_lang)
if subtags:
subtag_parts = subtags.split('_')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
filtered_languages[new_code] = all_languages[previous_lang]
countries = 1
if countries == 1:
# remove countryless version of language if there's only one country
del filtered_languages[previous_lang]
if previous_code in filtered_languages:
filtered_languages[previous_code]['country'] = ''
countries = 0
previous_lang = current_lang
previous_code = current_code
# Write languages.py. # Write languages.py.
def write_languages_file(): def write_languages_file(languages):
new_file = open(languages_file, 'wb') new_file = open(languages_file, 'wb')
file_content = '# -*- coding: utf-8 -*-\n'\ file_content = '# -*- coding: utf-8 -*-\n'\
+ '# list of language codes\n'\ + '# list of language codes\n'\
@ -183,7 +193,9 @@ def write_languages_file():
if __name__ == "__main__": if __name__ == "__main__":
fetch_supported_languages() initialize_engines(settings['engines'])
join_language_lists() engines_languages = fetch_supported_languages()
filter_single_country_languages() all_languages = join_language_lists(engines_languages)
write_languages_file() filtered_languages = filter_language_list(all_languages)
assign_country_codes(filtered_languages, all_languages)
write_languages_file(filtered_languages)