Merge pull request #1252 from MarcAbonce/search-languages

[mod] Refactor engine's search language handling
This commit is contained in:
Adam Tauber 2018-04-05 17:27:07 +02:00 committed by GitHub
commit 283f6c9053
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
43 changed files with 414 additions and 306 deletions

File diff suppressed because one or more lines are too long

View file

@ -20,13 +20,14 @@ import sys
import threading
from os.path import realpath, dirname
from io import open
from babel.localedata import locale_identifiers
from flask_babel import gettext
from operator import itemgetter
from json import loads
from requests import get
from searx import settings
from searx import logger
from searx.utils import load_module
from searx.utils import load_module, match_language
logger = logger.getChild('engines')
@ -38,6 +39,8 @@ engines = {}
categories = {'general': []}
languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read())
babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())]
engine_shortcuts = {}
engine_default_args = {'paging': False,
@ -97,6 +100,22 @@ def load_engine(engine_data):
if engine_data['name'] in languages:
setattr(engine, 'supported_languages', languages[engine_data['name']])
# find custom aliases for non standard language codes
if hasattr(engine, 'supported_languages'):
if hasattr(engine, 'language_aliases'):
language_aliases = getattr(engine, 'language_aliases')
else:
language_aliases = {}
for engine_lang in getattr(engine, 'supported_languages'):
iso_lang = match_language(engine_lang, babel_langs, fallback=None)
if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
iso_lang not in getattr(engine, 'supported_languages'):
language_aliases[iso_lang] = engine_lang
if language_aliases:
setattr(engine, 'language_aliases', language_aliases)
# assign language fetching method if auxiliary method exists
if hasattr(engine, '_fetch_supported_languages'):
setattr(engine, 'fetch_supported_languages',

View file

@ -99,13 +99,13 @@ supported_languages = dict(lang_urls, **main_langs)
# do search-request
def request(query, params):
# translate the locale (e.g. 'en_US') to language code ('en')
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
query += '(' + main_langs[language] + ')'
query += b' (' + main_langs[language] + b')'
# prepare the request parameters
query = urlencode({'search': query})

View file

@ -16,12 +16,14 @@
from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = ['general']
paging = True
language_support = True
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
# search-url
base_url = 'https://www.bing.com/'
@ -32,9 +34,9 @@ search_string = 'search?{query}&first={offset}'
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
lang = params['language'].split('-')[0].upper()
lang = match_language(params['language'], supported_languages, language_aliases)
query = u'language:{} {}'.format(lang, query.decode('utf-8')).encode('utf-8')
query = u'language:{} {}'.format(lang.split('-')[0].upper(), query.decode('utf-8')).encode('utf-8')
search_path = search_string.format(
query=urlencode({'q': query}),

View file

@ -19,6 +19,7 @@ from lxml import html
from json import loads
import re
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = ['images']
@ -46,26 +47,6 @@ safesearch_types = {2: 'STRICT',
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
# get supported region code
def get_region_code(lang, lang_list=None):
    """Return the lower-cased region code to use for ``lang``.

    The codes searched are ``lang_list`` when it is given and non-empty,
    otherwise the module level ``supported_languages``.

    :param lang: language or locale code, e.g. ``'fr'`` or ``'fr-FR'``
    :param lang_list: optional list of codes to search
    :returns: the exact match, ``'nb-no'`` for codes starting with ``'no'``,
        the first listed code sharing the language part of ``lang``,
        or ``'en-us'`` when nothing matches
    """
    if lang in (lang_list or supported_languages):
        matched = lang
    elif lang.startswith('no'):
        matched = 'nb-NO'
    else:
        # fall back to the first listed code that has the same language part
        base = lang.split('-')[0]
        matched = next(
            (code for code in (lang_list or supported_languages) if code.split('-')[0] == base),
            None)

    return matched.lower() if matched else 'en-us'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
@ -74,7 +55,7 @@ def request(query, params):
query=urlencode({'q': query}),
offset=offset)
language = get_region_code(params['language'])
language = match_language(params['language'], supported_languages).lower()
params['cookies']['SRCHHPGUSR'] = \
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

View file

@ -14,8 +14,8 @@
from datetime import datetime
from dateutil import parser
from lxml import etree
from searx.utils import list_get
from searx.engines.bing import _fetch_supported_languages, supported_languages_url
from searx.utils import list_get, match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.url_utils import urlencode, urlparse, parse_qsl
# engine dependent config
@ -71,7 +71,7 @@ def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
language = params['language']
language = match_language(params['language'], supported_languages, language_aliases)
params['url'] = _get_url(query, language, offset, params['time_range'])

View file

@ -12,9 +12,10 @@
from json import loads
from lxml import html
from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
from searx.utils import match_language
categories = ['videos']
@ -47,8 +48,8 @@ def request(query, params):
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# language cookie
region = get_region_code(params['language'], lang_list=supported_languages)
params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
language = match_language(params['language'], supported_languages).lower()
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
# query and paging
params['url'] = search_url.format(query=urlencode({'q': query}),

View file

@ -15,6 +15,7 @@
from json import loads
from datetime import datetime
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = ['videos']
@ -32,7 +33,7 @@ supported_languages_url = 'https://api.dailymotion.com/languages'
# do search-request
def request(query, params):
locale = params['language']
locale = match_language(params['language'], supported_languages)
params['url'] = search_url.format(
query=urlencode({'search': query, 'localization': locale}),

View file

@ -18,14 +18,25 @@ from json import loads
from searx.engines.xpath import extract_text
from searx.poolrequests import get
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = ['general']
paging = True
language_support = True
supported_languages_url = 'https://duckduckgo.com/d2030.js'
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
time_range_support = True
language_aliases = {
'ar-SA': 'ar-XA',
'es-419': 'es-XL',
'ja': 'jp-JP',
'ko': 'kr-KR',
'sl-SI': 'sl-SL',
'zh-TW': 'tzh-TW',
'zh-HK': 'tzh-HK'
}
# search-url
url = 'https://duckduckgo.com/html?{query}&s={offset}&dc={dc_param}'
time_range_url = '&df={range}'
@ -42,34 +53,12 @@ content_xpath = './/a[@class="result__snippet"]'
# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
# custom fixes for languages
if lang[:2] == 'ja':
region_code = 'jp-jp'
elif lang[:2] == 'sl':
region_code = 'sl-sl'
elif lang == 'zh-TW':
region_code = 'tw-tzh'
elif lang == 'zh-HK':
region_code = 'hk-tzh'
elif lang[-2:] == 'SA':
region_code = 'xa-' + lang.split('-')[0]
elif lang[-2:] == 'GB':
region_code = 'uk-' + lang.split('-')[0]
else:
region_code = lang.split('-')
if len(region_code) == 2:
def get_region_code(lang, lang_list=[]):
lang_code = match_language(lang, lang_list, language_aliases, 'wt-WT')
lang_parts = lang_code.split('-')
# country code goes first
region_code = region_code[1].lower() + '-' + region_code[0].lower()
else:
# tries to get a country code from language
region_code = region_code[0].lower()
for lc in (lang_list or supported_languages):
lc = lc.split('-')
if region_code == lc[0]:
region_code = lc[1].lower() + '-' + lc[0].lower()
break
return region_code
return lang_parts[1].lower() + '-' + lang_parts[0].lower()
# do search-request
@ -79,7 +68,7 @@ def request(query, params):
offset = (params['pageno'] - 1) * 30
region_code = get_region_code(params['language'])
region_code = get_region_code(params['language'], supported_languages)
params['url'] = url.format(
query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)

View file

@ -2,9 +2,9 @@ import json
from lxml import html
from re import compile
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.url_utils import urlencode
from searx.utils import html_to_text
from searx.utils import html_to_text, match_language
url = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
@ -24,7 +24,8 @@ def result_to_text(url, text, htmlResult):
def request(query, params):
params['url'] = url.format(query=urlencode({'q': query}))
params['headers']['Accept-Language'] = params['language'].split('-')[0]
language = match_language(params['language'], supported_languages, language_aliases)
params['headers']['Accept-Language'] = language.split('-')[0]
return params

View file

@ -15,7 +15,10 @@
from json import loads
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, get_region_code
from searx.engines.duckduckgo import (
_fetch_supported_languages, supported_languages_url,
get_region_code, language_aliases
)
from searx.poolrequests import get
from searx.url_utils import urlencode

View file

@ -14,6 +14,7 @@ from lxml import html, etree
from searx.engines.xpath import extract_text, extract_url
from searx import logger
from searx.url_utils import urlencode, urlparse, parse_qsl
from searx.utils import match_language
logger = logger.getChild('google engine')
@ -72,7 +73,7 @@ country_to_hostname = {
'RO': 'www.google.ro', # Romania
'RU': 'www.google.ru', # Russia
'SK': 'www.google.sk', # Slovakia
'SL': 'www.google.si', # Slovenia (SL -> si)
'SI': 'www.google.si', # Slovenia
'SE': 'www.google.se', # Sweden
'TH': 'www.google.co.th', # Thailand
'TR': 'www.google.com.tr', # Turkey
@ -165,22 +166,20 @@ def extract_text_from_dom(result, xpath):
def request(query, params):
offset = (params['pageno'] - 1) * 10
# temporary fix until a way of supporting en-US is found
if params['language'] == 'en-US':
params['language'] = 'en-GB'
if params['language'][:2] == 'jv':
language = 'jw'
country = 'ID'
url_lang = 'lang_jw'
else:
language_array = params['language'].lower().split('-')
if len(language_array) == 2:
language = match_language(params['language'], supported_languages)
language_array = language.split('-')
if params['language'].find('-') > 0:
country = params['language'].split('-')[1]
elif len(language_array) == 2:
country = language_array[1]
else:
country = 'US'
language = language_array[0] + ',' + language_array[0] + '-' + country
url_lang = 'lang_' + language_array[0]
# temporary fix until a way of supporting en-US is found
if language == 'en-US':
country = 'GB'
url_lang = 'lang_' + language
if use_locale_domain:
google_hostname = country_to_hostname.get(country.upper(), default_hostname)
@ -196,7 +195,7 @@ def request(query, params):
if params['time_range'] in time_range_dict:
params['url'] += time_range_search.format(range=time_range_dict[params['time_range']])
params['headers']['Accept-Language'] = language
params['headers']['Accept-Language'] = language + ',' + language + '-' + country
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
params['google_hostname'] = google_hostname

View file

@ -13,6 +13,7 @@
from lxml import html
from searx.engines.google import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language
# search-url
categories = ['news']
@ -50,8 +51,9 @@ def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}),
search_options=urlencode(search_options))
language_array = params['language'].lower().split('-')
params['url'] += '&lr=lang_' + language_array[0]
language = match_language(params['language'], supported_languages).split('-')[0]
if language:
params['url'] += '&lr=lang_' + language
return params

View file

@ -14,6 +14,7 @@ from datetime import datetime
from json import loads
from searx.utils import html_to_text
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = None
@ -45,16 +46,8 @@ def request(query, params):
offset=offset)
# add language tag
if params['language'] == 'no' or params['language'].startswith('no-'):
params['language'] = params['language'].replace('no', 'nb', 1)
if params['language'].find('-') < 0:
# tries to get a country code from language
for lang in supported_languages:
lc = lang.split('-')
if params['language'] == lc[0]:
params['language'] = lang
break
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
language = match_language(params['language'], supported_languages)
params['url'] += '&locale=' + language.replace('-', '_').lower()
return params

View file

@ -14,6 +14,7 @@ from json import loads
import re
from lxml.html import fromstring
from searx.url_utils import unquote, urlencode
from searx.utils import match_language
# engine dependent config
categories = ['general', 'images']
@ -35,11 +36,8 @@ regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=')
# do search-request
def request(query, params):
if params['language'].split('-')[0] == 'no':
region = 'nb-NO'
else:
region = params['language']
ui_language = params['language'].split('-')[0]
region = match_language(params['language'], supported_languages)
ui_language = region.split('-')[0]
search_path = search_string.format(
query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}),

View file

@ -16,6 +16,7 @@ from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language
from json import loads
from lxml.html import fromstring
@ -56,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
def request(query, params):
language = params['language'].split('-')[0]
language = match_language(params['language'], supported_languages).split('-')[0]
params['url'] = url_search.format(
query=urlencode({'label': query, 'language': language}))
@ -68,7 +69,7 @@ def response(resp):
html = fromstring(resp.text)
wikidata_ids = html.xpath(wikidata_ids_xpath)
language = resp.search_params['language'].split('-')[0]
language = match_language(resp.search_params['language'], supported_languages).split('-')[0]
# TODO: make requests asynchronous to avoid timeout when result_count > 1
for wikidata_id in wikidata_ids[:result_count]:

View file

@ -13,6 +13,7 @@
from json import loads
from lxml.html import fromstring
from searx.url_utils import quote, urlencode
from searx.utils import match_language
# search-url
base_url = u'https://{language}.wikipedia.org/'
@ -30,13 +31,7 @@ supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url
def url_lang(lang):
lang = lang.split('-')[0]
if lang not in supported_languages:
language = 'en'
else:
language = lang
return language
return match_language(lang, supported_languages).split('-')[0]
# do search-request

View file

@ -14,6 +14,7 @@
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.url_utils import unquote, urlencode
from searx.utils import match_language
# engine dependent config
categories = ['general']
@ -39,6 +40,8 @@ time_range_dict = {'day': ['1d', 'd'],
'week': ['1w', 'w'],
'month': ['1m', 'm']}
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}
# remove yahoo-specific tracking-url
def parse_url(url_string):
@ -70,23 +73,16 @@ def _get_url(query, offset, language, time_range):
lang=language)
def _get_language(params):
if params['language'][:2] == 'zh':
if params['language'] == 'zh' or params['language'] == 'zh-CH':
return 'szh'
else:
return 'tzh'
else:
return params['language'].split('-')[0]
# do search-request
def request(query, params):
if params['time_range'] and params['time_range'] not in time_range_dict:
return params
offset = (params['pageno'] - 1) * 10 + 1
language = _get_language(params)
language = match_language(params['language'], supported_languages, language_aliases)
if language not in language_aliases.values():
language = language.split('-')[0]
language = language.replace('-', '_').lower()
params['url'] = _get_url(query, offset, language, params['time_range'])
@ -145,7 +141,11 @@ def _fetch_supported_languages(resp):
dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="yschlang"]/span/label/input')
for option in options:
code = option.xpath('./@value')[0][5:].replace('_', '-')
code_parts = option.xpath('./@value')[0][5:].split('_')
if len(code_parts) == 2:
code = code_parts[0] + '-' + code_parts[1].upper()
else:
code = code_parts[0]
supported_languages.append(code)
return supported_languages

View file

@ -13,9 +13,12 @@ import re
from datetime import datetime, timedelta
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url
from searx.engines.yahoo import (
parse_url, _fetch_supported_languages, supported_languages_url, language_aliases
)
from dateutil import parser
from searx.url_utils import urlencode
from searx.utils import match_language
# engine dependent config
categories = ['news']
@ -38,7 +41,7 @@ suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
language = params['language'].split('-')[0]
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
params['url'] = search_url.format(offset=offset,
query=urlencode({'p': query}),

View file

@ -5,11 +5,7 @@
language_codes = (
(u"ar-SA", u"العربية", u"", u"Arabic"),
(u"bg-BG", u"Български", u"", u"Bulgarian"),
(u"ca", u"Català", u"", u"Catalan"),
(u"ca-AD", u"Català", u"Andorra", u"Catalan"),
(u"ca-CT", u"Català", u"", u"Catalan"),
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
(u"ca-FR", u"Català", u"França", u"Catalan"),
(u"ca-ES", u"Català", u"", u"Catalan"),
(u"cs-CZ", u"Čeština", u"", u"Czech"),
(u"da-DK", u"Dansk", u"", u"Danish"),
(u"de", u"Deutsch", u"", u"German"),
@ -21,55 +17,51 @@ language_codes = (
(u"en-AU", u"English", u"Australia", u"English"),
(u"en-CA", u"English", u"Canada", u"English"),
(u"en-GB", u"English", u"United Kingdom", u"English"),
(u"en-ID", u"English", u"Indonesia", u"English"),
(u"en-IE", u"English", u"Ireland", u"English"),
(u"en-IN", u"English", u"India", u"English"),
(u"en-MY", u"English", u"Malaysia", u"English"),
(u"en-NZ", u"English", u"New Zealand", u"English"),
(u"en-PH", u"English", u"Philippines", u"English"),
(u"en-SG", u"English", u"Singapore", u"English"),
(u"en-US", u"English", u"United States", u"English"),
(u"en-ZA", u"English", u"South Africa", u"English"),
(u"es", u"Español", u"", u"Spanish"),
(u"es-AD", u"Español", u"Andorra", u"Spanish"),
(u"es-AR", u"Español", u"Argentina", u"Spanish"),
(u"es-CL", u"Español", u"Chile", u"Spanish"),
(u"es-CO", u"Español", u"Colombia", u"Spanish"),
(u"es-ES", u"Español", u"España", u"Spanish"),
(u"es-MX", u"Español", u"México", u"Spanish"),
(u"es-PE", u"Español", u"Perú", u"Spanish"),
(u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
(u"et-EE", u"Eesti", u"", u"Estonian"),
(u"fa-IR", u"فارسی", u"", u"Persian"),
(u"fi-FI", u"Suomi", u"", u"Finnish"),
(u"fr", u"Français", u"", u"French"),
(u"fr-AD", u"Français", u"Andorre", u"French"),
(u"fr-BE", u"Français", u"Belgique", u"French"),
(u"fr-CA", u"Français", u"Canada", u"French"),
(u"fr-CH", u"Français", u"Suisse", u"French"),
(u"fr-FR", u"Français", u"France", u"French"),
(u"he-IL", u"עברית", u"", u"Hebrew"),
(u"hr-HR", u"Hrvatski", u"", u"Croatian"),
(u"hu-HU", u"Magyar", u"", u"Hungarian"),
(u"it", u"Italiano", u"", u"Italian"),
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
(u"it-IT", u"Italiano", u"Italia", u"Italian"),
(u"id-ID", u"Indonesia", u"", u"Indonesian"),
(u"is-IS", u"Íslenska", u"", u"Icelandic"),
(u"it-IT", u"Italiano", u"", u"Italian"),
(u"ja-JP", u"日本語", u"", u"Japanese"),
(u"ko-KR", u"한국어", u"", u"Korean"),
(u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
(u"lv-LV", u"Latviešu", u"", u"Latvian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
(u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
(u"nl", u"Nederlands", u"", u"Dutch"),
(u"nl-BE", u"Nederlands", u"België", u"Dutch"),
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
(u"no-NO", u"Norsk", u"", u"Norwegian"),
(u"pl-PL", u"Polski", u"", u"Polish"),
(u"pt", u"Português", u"", u"Portuguese"),
(u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
(u"ro-RO", u"Română", u"", u"Romanian"),
(u"ru-RU", u"Русский", u"", u"Russian"),
(u"sk-SK", u"Slovenčina", u"", u"Slovak"),
(u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
(u"sr-RS", u"Српски", u"", u"Serbian"),
(u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"th-TH", u"ไทย", u"", u"Thai"),
(u"tr-TR", u"Türkçe", u"", u"Turkish"),
(u"uk-UA", u"Українська", u"", u"Ukrainian"),
(u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
(u"zh", u"中文", u"", u"Chinese"),
(u"zh-CN", u"中文", u"中国", u"Chinese"),
(u"zh-HK", u"中文", u"香港", u"Chinese"),
(u"zh-TW", u"中文", u"台湾", u"Chinese")
(u"zh-TW", u"中文", u"台灣", u"Chinese")
)

View file

@ -115,10 +115,6 @@ class SearchLanguageSetting(EnumStringSetting):
pass
elif lang in self.choices:
data = lang
elif data == 'nb-NO':
data = 'no-NO'
elif data == 'ar-XA':
data = 'ar-SA'
else:
data = self.value
self.value = data

View file

@ -96,7 +96,11 @@ class RawTextQuery(object):
break
# user may set a valid, yet not selectable language
if not self.languages and VALID_LANGUAGE_CODE.match(lang):
if VALID_LANGUAGE_CODE.match(lang):
lang_parts = lang.split('-')
if len(lang_parts) > 1:
lang = lang_parts[0].lower() + '-' + lang_parts[1].upper()
if lang not in self.languages:
self.languages.append(lang)
parse_next = True

View file

@ -187,7 +187,7 @@
</td>
<th>{{ search_engine.name }}</th>
<td class="name">{{ shortcuts[search_engine.name] }}</td>
<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
@ -197,7 +197,7 @@
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
<td>{{ support_toggle(search_engine.time_range_support==True) }}</td>
<td>{{ support_toggle(search_engine.safesearch==True) }}</td>
<td>{{ support_toggle(current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages) }}</td>
<td>{{ support_toggle(stats[search_engine.name].supports_selected_language) }}</td>
<td>{{ shortcuts[search_engine.name] }}</td>
<th>{{ search_engine.name }}</th>
<td class="onoff-checkbox">

View file

@ -4,6 +4,7 @@ import hmac
import os
import re
from babel.core import get_global
from babel.dates import format_date
from codecs import getincrementalencoder
from imp import load_source
@ -12,6 +13,7 @@ from os.path import splitext, join
from random import choice
import sys
from searx import settings
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx import settings
@ -322,6 +324,65 @@ def is_valid_lang(lang):
return False
# auxiliary function to match lang_code in lang_list
def _match_language(lang_code, lang_list=[], custom_aliases={}):
# replace language code with a custom alias if necessary
if lang_code in custom_aliases:
lang_code = custom_aliases[lang_code]
if lang_code in lang_list:
return lang_code
# try to get the most likely country for this language
subtags = get_global('likely_subtags').get(lang_code)
if subtags:
subtag_parts = subtags.split('_')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
if new_code in custom_aliases:
new_code = custom_aliases[new_code]
if new_code in lang_list:
return new_code
# try to get the any supported country for this language
for lc in lang_list:
if lang_code == lc.split('-')[0]:
return lc
return None
# get the language code from lang_list that best matches locale_code
def match_language(locale_code, lang_list=None, custom_aliases=None, fallback='en-US'):
    """Return the code from ``lang_list`` that best matches ``locale_code``.

    Attempts are made in this order, each through ``_match_language``:

    1. ``locale_code`` as given;
    2. the language with the first babel ``territory_aliases`` entry of
       its country, when ``locale_code`` has more than one part;
    3. the babel ``language_aliases`` entry for the language;
    4. the language alone, when ``locale_code`` also carried a country.

    :param locale_code: locale such as ``'en-US'`` or bare language such as ``'en'``
    :param lang_list: container of codes to match against
    :param custom_aliases: dict mapping a code to the code used in ``lang_list``
    :param fallback: value returned when no attempt succeeds (may be ``None``)
    :returns: the matching code or ``fallback``
    """
    # None sentinels instead of mutable defaults shared between calls
    lang_list = [] if lang_list is None else lang_list
    custom_aliases = {} if custom_aliases is None else custom_aliases

    # try to get language from given locale_code
    language = _match_language(locale_code, lang_list, custom_aliases)
    if language:
        return language

    locale_parts = locale_code.split('-')
    lang_code = locale_parts[0]

    # try to get language using an equivalent country code
    if len(locale_parts) > 1:
        country_alias = get_global('territory_aliases').get(locale_parts[-1])
        if country_alias:
            language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
            if language:
                return language

    # try to get language using an equivalent language code
    alias = get_global('language_aliases').get(lang_code)
    if alias:
        language = _match_language(alias, lang_list, custom_aliases)
        if language:
            return language

    if lang_code != locale_code:
        # try to get language from given language without giving the country
        language = _match_language(lang_code, lang_list, custom_aliases)

    # language is falsy here whenever every attempt above failed
    return language or fallback
def load_module(filename, module_dir):
modname = splitext(filename)[0]
if modname in sys.modules:

View file

@ -58,16 +58,16 @@ from searx.engines import (
from searx.utils import (
UnicodeWriter, highlight_content, html_to_text, get_resources_directory,
get_static_files, get_result_templates, get_themes, gen_useragent,
dict_subset, prettify_url
dict_subset, prettify_url, match_language
)
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx.languages import language_codes as languages
from searx.search import SearchWithPlugins, get_search_query_from_webapp
from searx.query import RawTextQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
from searx.url_utils import urlencode, urlparse, urljoin
from searx.utils import new_hmac
@ -133,7 +133,7 @@ if not searx_debug \
babel = Babel(app)
rtl_locales = ['ar', 'arc', 'bcc', 'bqi', 'ckb', 'dv', 'fa', 'glk', 'he',
'ku', 'mzn', 'pnb'', ''ps', 'sd', 'ug', 'ur', 'yi']
'ku', 'mzn', 'pnb', 'ps', 'sd', 'ug', 'ur', 'yi']
# used when translating category names
_category_names = (gettext('files'),
@ -352,9 +352,11 @@ def render(template_name, override_theme=None, **kwargs):
kwargs['safesearch'] = str(request.preferences.get_value('safesearch'))
kwargs['language_codes'] = language_codes
kwargs['language_codes'] = languages
if 'current_language' not in kwargs:
kwargs['current_language'] = request.preferences.get_value('language')
kwargs['current_language'] = match_language(request.preferences.get_value('language'),
LANGUAGE_CODES,
fallback=settings['search']['language'])
# override url_for function in templates
kwargs['url_for'] = url_for_theme
@ -590,7 +592,9 @@ def index():
infoboxes=result_container.infoboxes,
paging=result_container.paging,
unresponsive_engines=result_container.unresponsive_engines,
current_language=search_query.lang,
current_language=match_language(search_query.lang,
LANGUAGE_CODES,
fallback=settings['search']['language']),
base_url=get_base_url(),
theme=get_current_theme_name(),
favicons=global_favicons[themes.index(get_current_theme_name())]
@ -687,6 +691,10 @@ def preferences():
'warn_time': False}
if e.timeout > settings['outgoing']['request_timeout']:
stats[e.name]['warn_timeout'] = True
if match_language(request.preferences.get_value('language'),
getattr(e, 'supported_languages', []),
getattr(e, 'language_aliases', {}), None):
stats[e.name]['supports_selected_language'] = True
# get first element [0], the engine time,
# and then the second element [1] : the time (the first one is the label)

View file

@ -19,12 +19,17 @@ class TestArchLinuxEngine(SearxTestCase):
query = 'test_query'
dic = defaultdict(dict)
dic['pageno'] = 1
dic['language'] = 'en_US'
dic['language'] = 'en-US'
params = archlinux.request(query, dic)
self.assertTrue('url' in params)
self.assertTrue(query in params['url'])
self.assertTrue('wiki.archlinux.org' in params['url'])
for lang, name in archlinux.main_langs:
dic['language'] = lang
params = archlinux.request(query, dic)
self.assertTrue(name in params['url'])
for lang, domain in domains.items():
dic['language'] = lang
params = archlinux.request(query, dic)

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestBingEngine(SearxTestCase):
def test_request(self):
bing.supported_languages = ['en', 'fr', 'zh-CHS', 'zh-CHT', 'pt-PT', 'pt-BR']
query = u'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0

View file

@ -9,7 +9,6 @@ class TestBingImagesEngine(SearxTestCase):
def test_request(self):
bing_images.supported_languages = ['fr-FR', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -8,10 +8,11 @@ import lxml
class TestBingNewsEngine(SearxTestCase):
def test_request(self):
bing_news.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
dicto['time_range'] = ''
params = bing_news.request(query, dicto)
self.assertIn('url', params)

View file

@ -9,7 +9,6 @@ class TestBingVideosEngine(SearxTestCase):
def test_request(self):
bing_videos.supported_languages = ['fr-FR', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -8,10 +8,11 @@ from searx.testing import SearxTestCase
class TestDailymotionEngine(SearxTestCase):
def test_request(self):
dailymotion.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = dailymotion.request(query, dicto)
self.assertTrue('url' in params)
self.assertTrue(query in params['url'])

View file

@ -1,18 +1,21 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import mock
from searx.engines import duckduckgo
from searx.engines import load_engine, duckduckgo
from searx.testing import SearxTestCase
class TestDuckduckgoEngine(SearxTestCase):
def test_request(self):
duckduckgo = load_engine({'engine': 'duckduckgo', 'name': 'duckduckgo'})
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'de-CH'
dicto['time_range'] = ''
dicto['language'] = 'de-CH'
params = duckduckgo.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
@ -20,16 +23,19 @@ class TestDuckduckgoEngine(SearxTestCase):
self.assertIn('ch-de', params['url'])
self.assertIn('s=0', params['url'])
# when ddg uses non standard code
# when ddg uses non standard codes
dicto['language'] = 'zh-HK'
params = duckduckgo.request(query, dicto)
self.assertIn('hk-tzh', params['url'])
dicto['language'] = 'en-GB'
params = duckduckgo.request(query, dicto)
self.assertIn('uk-en', params['url'])
# no country given
duckduckgo.supported_languages = ['de-CH', 'en-US']
dicto['language'] = 'de'
dicto['language'] = 'en'
params = duckduckgo.request(query, dicto)
self.assertIn('ch-de', params['url'])
self.assertIn('us-en', params['url'])
def test_no_url_in_request_year_time_range(self):
dicto = defaultdict(dict)

View file

@ -18,6 +18,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
self.assertEqual(result, 'Text in link')
def test_request(self):
duckduckgo_definitions.supported_languages = ['en-US', 'es-ES']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -9,7 +9,6 @@ class TestDuckduckgoImagesEngine(SearxTestCase):
def test_request(self):
duckduckgo_images.supported_languages = ['de-CH', 'en-US']
query = 'test_query'
dicto = defaultdict(dict)
dicto['is_test'] = True

View file

@ -15,6 +15,8 @@ class TestGoogleEngine(SearxTestCase):
return response
def test_request(self):
google.supported_languages = ['en', 'fr', 'zh-CN']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
@ -31,6 +33,11 @@ class TestGoogleEngine(SearxTestCase):
self.assertIn('google.co', params['url'])
self.assertIn('en', params['headers']['Accept-Language'])
dicto['language'] = 'zh'
params = google.request(query, dicto)
self.assertIn('google.com', params['url'])
self.assertIn('zh-CN', params['headers']['Accept-Language'])
def test_response(self):
self.assertRaises(AttributeError, google.response, None)
self.assertRaises(AttributeError, google.response, [])

View file

@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
class TestGoogleNewsEngine(SearxTestCase):
def test_request(self):
google_news.supported_languages = ['en-US', 'fr-FR']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestQwantEngine(SearxTestCase):
def test_request(self):
qwant.supported_languages = ['en-US', 'fr-CA', 'fr-FR']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 0
@ -26,7 +27,6 @@ class TestQwantEngine(SearxTestCase):
self.assertIn('en_us', params['url'])
self.assertIn('news', params['url'])
qwant.supported_languages = ['en', 'fr-FR', 'fr-CA']
dicto['language'] = 'fr'
params = qwant.request(query, dicto)
self.assertIn('fr_fr', params['url'])

View file

@ -7,6 +7,7 @@ from searx.testing import SearxTestCase
class TestSwisscowsEngine(SearxTestCase):
def test_request(self):
swisscows.supported_languages = ['de-AT', 'de-DE']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1

View file

@ -9,6 +9,7 @@ from searx.testing import SearxTestCase
class TestWikidataEngine(SearxTestCase):
def test_request(self):
wikidata.supported_languages = ['en', 'es']
query = 'test_query'
dicto = defaultdict(dict)
dicto['language'] = 'en-US'

View file

@ -25,11 +25,12 @@ class TestYahooEngine(SearxTestCase):
self.assertEqual('https://this.is.the.url/', url)
def test_request(self):
yahoo.supported_languages = ['en', 'fr', 'zh-CHT', 'zh-CHS']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['time_range'] = ''
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = yahoo.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])
@ -39,6 +40,16 @@ class TestYahooEngine(SearxTestCase):
self.assertIn('sB', params['cookies'])
self.assertIn('fr', params['cookies']['sB'])
dicto['language'] = 'zh'
params = yahoo.request(query, dicto)
self.assertIn('zh_chs', params['url'])
self.assertIn('zh_chs', params['cookies']['sB'])
dicto['language'] = 'zh-TW'
params = yahoo.request(query, dicto)
self.assertIn('zh_cht', params['url'])
self.assertIn('zh_cht', params['cookies']['sB'])
def test_no_url_in_request_year_time_range(self):
dicto = defaultdict(dict)
query = 'test_query'
@ -168,5 +179,5 @@ class TestYahooEngine(SearxTestCase):
self.assertEqual(type(languages), list)
self.assertEqual(len(languages), 3)
self.assertIn('ar', languages)
self.assertIn('zh-chs', languages)
self.assertIn('zh-cht', languages)
self.assertIn('zh-CHS', languages)
self.assertIn('zh-CHT', languages)

View file

@ -9,10 +9,11 @@ from searx.testing import SearxTestCase
class TestYahooNewsEngine(SearxTestCase):
def test_request(self):
yahoo_news.supported_languages = ['en', 'fr']
query = 'test_query'
dicto = defaultdict(dict)
dicto['pageno'] = 1
dicto['language'] = 'fr_FR'
dicto['language'] = 'fr-FR'
params = yahoo_news.request(query, dicto)
self.assertIn('url', params)
self.assertIn(query, params['url'])

View file

@ -65,6 +65,31 @@ class TestUtils(SearxTestCase):
for test_url, expected in data:
self.assertEqual(utils.prettify_url(test_url, max_length=32), expected)
def test_match_language(self):
    """Exercise utils.match_language: exact hits, fallback, country guessing and aliases."""
    aliases = {'en-GB': 'en-UK', 'he': 'iw'}

    # exact hit, explicit fallback for an empty candidate list, inline alias map
    self.assertEqual(utils.match_language('es', ['es']), 'es')
    self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
    self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')

    # each entry: (positional arguments for match_language, expected result)
    cases = (
        # guess country
        (('de-DE', ['de']), 'de'),
        (('de', ['de-DE']), 'de-DE'),
        (('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES'),
        (('es-CO', ['es-MX']), 'es-MX'),
        (('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB'),
        (('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK'),

        # language aliases
        (('iw', ['he']), 'he'),
        (('he', ['iw'], aliases), 'iw'),
        (('iw-IL', ['he']), 'he'),
        (('he-IL', ['iw'], aliases), 'iw'),
        (('iw', ['he-IL']), 'he-IL'),
        (('he', ['iw-IL'], aliases), 'iw-IL'),
        (('iw-IL', ['he-IL']), 'he-IL'),
        (('he-IL', ['iw-IL'], aliases), 'iw-IL'),
    )
    for arguments, expected in cases:
        self.assertEqual(utils.match_language(*arguments), expected)
class TestHTMLTextExtractor(SearxTestCase):

View file

@ -2,83 +2,27 @@
# This script generates languages.py from intersecting each engine's supported languages.
#
# The country names are obtained from http://api.geonames.org which requires registering as a user.
#
# Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong.
from requests import get
from lxml.html import fromstring
from json import loads, dump
from json import dump
import io
from sys import path
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
path.append('../searx') # noqa
from searx import settings
from searx.url_utils import urlencode
from searx.engines import initialize_engines, engines
# Geonames API for country names.
geonames_user = '' # ADD USER NAME HERE
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'
engines_languages = {}
# Tell whether a language code is acceptable (not blacklisted and not a dialect).
def valid_code(lang_code):
    """Return False for rejected codes and dialects, True for everything else.

    A code is rejected when it starts with 'xx', is one of the explicitly
    listed codes, ends with one of the listed country codes, or is_dialect()
    reports it as a dialect.
    """
    # sl-SL is technically not invalid, but still a mistake
    rejected_codes = ['sl-SL', 'wt-WT', 'jw']
    rejected_countries = ['UK', 'XA', 'XL']

    if lang_code[:2] == 'xx':
        return False
    if lang_code in rejected_codes:
        return False
    if lang_code[-2:] in rejected_countries:
        return False
    if is_dialect(lang_code):
        return False
    return True
# A dialect is any code carrying more than a language and a country tag.
def is_dialect(lang_code):
    """Return True when lang_code has extra or over-long subtags.

    The code is split on '-'. It counts as a dialect when it has more than
    two parts, when the first part is longer than three characters, or when
    it has exactly two parts and the second is longer than two characters.
    """
    parts = lang_code.split('-')

    if len(parts) > 2:
        return True
    if len(parts[0]) > 3:
        return True
    return len(parts) == 2 and len(parts[1]) > 2
# Get country name in specified language.
def get_country_name(locale):
    """Look up the country name for a '<language>-<country>' code via Geonames.

    The request is sent to country_names_url with the language part as
    'lang', the country part as 'country' and geonames_user as 'username'.

    Returns the 'countryName' value of the single entry in the response
    (or '' if that key is absent). Returns '' without any request when no
    Geonames user is configured or when locale does not consist of exactly
    two '-'-separated parts, and '' when the response does not contain
    exactly one entry under 'geonames'.
    """
    # Compare by value/truthiness: `is ''` tests object identity, which is an
    # implementation detail for strings and raises a SyntaxWarning on
    # Python 3.8+.
    if not geonames_user:
        return ''

    locale_parts = locale.split('-')
    if len(locale_parts) != 2:
        return ''
    language, country = locale_parts

    url = country_names_url.format(parameters=urlencode({'lang': language,
                                                         'country': country,
                                                         'username': geonames_user}))
    response = get(url)
    # named `payload` so the standard `json` module name is not shadowed
    payload = loads(response.text)
    content = payload.get('geonames', None)
    if content is None or len(content) != 1:
        print("No country name found for " + language + "-" + country)
        return ''

    return content[0].get('countryName', '')
# Fetches the supported languages of each engine and writes them to a JSON file.
def fetch_supported_languages():
initialize_engines(settings['engines'])
engines_languages = {}
for engine_name in engines:
if hasattr(engines[engine_name], 'fetch_supported_languages'):
try:
@ -90,81 +34,135 @@ def fetch_supported_languages():
with io.open(engines_languages_file, "w", encoding="utf-8") as f:
dump(engines_languages, f, ensure_ascii=False)
return engines_languages
# Build a babel Locale from a '-'-separated language code, if babel accepts it.
def get_locale(lang_code):
    """Return Locale.parse(lang_code, sep='-'), or None when parsing fails.

    A failure is an UnknownLocaleError or a ValueError raised by the parse
    call; any other exception propagates to the caller.
    """
    try:
        return Locale.parse(lang_code, sep='-')
    except (UnknownLocaleError, ValueError):
        return None
# Record engine_name as a supporter of lang_code, at most once.
def add_engine_counter(lang_code, engine_name, languages):
    """Add engine_name to languages[lang_code]['counter'] in place.

    Nothing happens when lang_code is not a key of languages. The 'counter'
    list is created on first use and never receives the same engine twice.
    """
    if lang_code not in languages:
        return

    supporters = languages[lang_code].setdefault('counter', [])
    if engine_name not in supporters:
        supporters.append(engine_name)
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
global languages
# include wikipedia first for more accurate language names
languages = {code: lang for code, lang
in engines_languages['wikipedia'].items()
if valid_code(code)}
# TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(engines_languages):
language_list = {}
for engine_name in engines_languages:
for locale in engines_languages[engine_name]:
if valid_code(locale):
# if language is not on list or if it has no name yet
if locale not in languages or not languages[locale].get('name'):
if isinstance(engines_languages[engine_name], dict):
languages[locale] = engines_languages[engine_name][locale]
for lang_code in engines_languages[engine_name]:
# apply custom fixes if necessary
if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
lang_code = next(lc for lc, alias in engines[engine_name].language_aliases.items()
if lang_code == alias)
locale = get_locale(lang_code)
# ensure that lang_code uses standard language and country codes
if locale and locale.territory:
lang_code = locale.language + '-' + locale.territory
# add locale if it's not in list
if lang_code not in language_list:
if locale:
language_list[lang_code] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name,
'country': locale.get_territory_name() or ''}
# also add language without country
if locale.language not in language_list:
language_list[locale.language] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name}
else:
languages[locale] = {}
language_list[lang_code] = {}
# add to counter of engines that support given language
lang = locale.split('-')[0]
if lang in languages:
if 'counter' not in languages[lang]:
languages[lang]['counter'] = [engine_name]
elif engine_name not in languages[lang]['counter']:
languages[lang]['counter'].append(engine_name)
# count engine for both language_country combination and language alone
add_engine_counter(lang_code, engine_name, language_list)
add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
# filter list to include only languages supported by most engines
min_supported_engines = int(0.70 * len(engines_languages))
languages = {code: lang for code, lang
in languages.items()
if len(lang.get('counter', [])) >= min_supported_engines or
len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
# get locales that have no name or country yet
for locale in languages.keys():
# try to get language names
if not languages[locale].get('name'):
name = languages.get(locale.split('-')[0], {}).get('name', None)
if name:
languages[locale]['name'] = name
else:
# filter out locales with no name
del languages[locale]
continue
# try to get language name in english
if not languages[locale].get('english_name'):
languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
# try to get country name
if locale.find('-') > 0 and not languages[locale].get('country'):
languages[locale]['country'] = get_country_name(locale) or ''
return language_list
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages():
prev_lang = None
prev_code = None
for code in sorted(languages):
lang = code.split('-')[0]
if lang == prev_lang:
countries += 1
else:
if prev_lang is not None and countries == 1:
del languages[prev_lang]
languages[prev_code]['country'] = ''
# Filter language list so it only includes the most supported languages and countries.
def filter_language_list(all_languages, min_supported_engines=10):
    """Return the entries of all_languages that are supported widely enough.

    all_languages maps a language code to a dict whose optional 'counter'
    entry lists the engines supporting that code.

    min_supported_engines is the number of supporting engines from which a
    language is always kept (default 10, the previously hard-coded value).

    A language is kept when its counter reaches min_supported_engines, or
    when every "main" engine appears in its counter. Main engines are those
    in `engines` that are in the 'general' category, have a non-empty
    supported_languages and are not disabled.
    """
    main_engines = [engine_name for engine_name, engine in engines.items()
                    if 'general' in engine.categories and
                    engine.supported_languages and
                    not engine.disabled]

    def is_well_supported(lang):
        supporting_engines = lang.get('counter', [])
        if len(supporting_engines) >= min_supported_engines:
            return True
        # all() is vacuously True for an empty sequence: without this guard,
        # having no main engines would let every language through unfiltered.
        return bool(main_engines) and all(main_engine in supporting_engines
                                          for main_engine in main_engines)

    # filter list to include only languages supported by most engines or all default general engines
    return {code: lang for code, lang in all_languages.items()
            if is_well_supported(lang)}
# Add country codes to languages without one and filter out language codes.
def assign_country_codes(filtered_languages, all_languages):
sorted_languages = sorted(all_languages,
key=lambda lang: len(all_languages[lang].get('counter', [])),
reverse=True)
previous_lang = None
previous_code = None
countries = 0
prev_lang = lang
prev_code = code
for current_code in sorted(filtered_languages):
current_lang = current_code.split('-')[0]
# count country codes per language
if current_lang == previous_lang:
countries += 1
else:
if previous_lang is not None:
# if language has no single country code
if countries == 0:
# try to get country code with most supported engines
for l in sorted_languages:
l_parts = l.split('-')
if len(l_parts) == 2 and l_parts[0] == previous_lang:
filtered_languages[l] = all_languages[l]
filtered_languages[l]['country'] = ''
countries = 1
break
if countries == 0:
# get most likely country code from babel
subtags = get_global('likely_subtags').get(previous_lang)
if subtags:
subtag_parts = subtags.split('_')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
filtered_languages[new_code] = all_languages[previous_lang]
countries = 1
if countries == 1:
# remove countryless version of language if there's only one country
del filtered_languages[previous_lang]
if previous_code in filtered_languages:
filtered_languages[previous_code]['country'] = ''
countries = 0
previous_lang = current_lang
previous_code = current_code
# Write languages.py.
def write_languages_file():
def write_languages_file(languages):
new_file = open(languages_file, 'wb')
file_content = '# -*- coding: utf-8 -*-\n'\
+ '# list of language codes\n'\
@ -183,7 +181,9 @@ def write_languages_file():
if __name__ == "__main__":
fetch_supported_languages()
join_language_lists()
filter_single_country_languages()
write_languages_file()
initialize_engines(settings['engines'])
engines_languages = fetch_supported_languages()
all_languages = join_language_lists(engines_languages)
filtered_languages = filter_language_list(all_languages)
assign_country_codes(filtered_languages, all_languages)
write_languages_file(filtered_languages)