forked from Ponysearch/Ponysearch
minor fixes in utils/fetch_languages.py
This commit is contained in:
parent
af35eee10b
commit
4a1ff56389
5 changed files with 36 additions and 32 deletions
File diff suppressed because one or more lines are too long
|
@ -131,7 +131,8 @@ def _fetch_supported_languages(resp):
|
||||||
name = td[2].xpath('./a')[0].text
|
name = td[2].xpath('./a')[0].text
|
||||||
english_name = td[1].xpath('./a')[0].text
|
english_name = td[1].xpath('./a')[0].text
|
||||||
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
|
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
|
||||||
if articles >= 10000:
|
# exclude languages with too few articles
|
||||||
|
if articles >= 100000:
|
||||||
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
|
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
|
||||||
|
|
||||||
return supported_languages
|
return supported_languages
|
||||||
|
|
|
@ -124,8 +124,8 @@ language_codes = (
|
||||||
(u"war", u"Winaray", u"", u"Waray-Waray"),
|
(u"war", u"Winaray", u"", u"Waray-Waray"),
|
||||||
(u"xh", u"Xhosa", u"", u"Xhosa"),
|
(u"xh", u"Xhosa", u"", u"Xhosa"),
|
||||||
(u"zh", u"中文", u"", u"Chinese"),
|
(u"zh", u"中文", u"", u"Chinese"),
|
||||||
(u"zh-CN", u"中文", u"中国", u""),
|
(u"zh-CN", u"中文", u"中国", u"Chinese"),
|
||||||
(u"zh-HK", u"中文", u"香港", u"Chinese"),
|
(u"zh-HK", u"中文", u"香港", u"Chinese"),
|
||||||
(u"zh-TW", u"中文", u"台湾", u""),
|
(u"zh-TW", u"中文", u"台湾", u"Chinese"),
|
||||||
(u"zu", u"Isi-Zulu", u"", u"Zulu")
|
(u"zu", u"Isi-Zulu", u"", u"Zulu")
|
||||||
)
|
)
|
||||||
|
|
|
@ -172,7 +172,7 @@
|
||||||
</td>
|
</td>
|
||||||
<th>{{ search_engine.name }}</th>
|
<th>{{ search_engine.name }}</th>
|
||||||
<td>{{ shortcuts[search_engine.name] }}</td>
|
<td>{{ shortcuts[search_engine.name] }}</td>
|
||||||
<td><input type="checkbox" {{ "checked" if current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
|
{# 'all' ticks the box for every engine; otherwise the engine must list the selected locale or its base language #}
<td><input type="checkbox" {{ "checked" if current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
|
||||||
<td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
|
<td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
|
||||||
<td><input type="checkbox" {{ "checked" if search_engine.time_range_support==True else ""}} readonly="readonly" disabled="disabled"></td>
|
<td><input type="checkbox" {{ "checked" if search_engine.time_range_support==True else ""}} readonly="readonly" disabled="disabled"></td>
|
||||||
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
|
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
|
||||||
|
@ -181,7 +181,7 @@
|
||||||
<td class="{{ 'danger' if stats[search_engine.name]['warn_timeout'] else '' }}">{{ search_engine.timeout }}</td>
|
<td class="{{ 'danger' if stats[search_engine.name]['warn_timeout'] else '' }}">{{ search_engine.timeout }}</td>
|
||||||
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
|
<td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
|
||||||
<td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
|
<td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
|
||||||
<td><input type="checkbox" {{ "checked" if current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
|
{# 'all' ticks the box for every engine; otherwise the engine must list the selected locale or its base language #}
<td><input type="checkbox" {{ "checked" if current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
|
||||||
<td>{{ shortcuts[search_engine.name] }}</td>
|
<td>{{ shortcuts[search_engine.name] }}</td>
|
||||||
<th>{{ search_engine.name }}</th>
|
<th>{{ search_engine.name }}</th>
|
||||||
<td class="onoff-checkbox">
|
<td class="onoff-checkbox">
|
||||||
|
|
|
@ -32,25 +32,28 @@ languages = {}
|
||||||
def valid_code(lang_code):
|
def valid_code(lang_code):
|
||||||
# filter invalid codes
|
# filter invalid codes
|
||||||
# sl-SL is technically not invalid, but still a mistake
|
# sl-SL is technically not invalid, but still a mistake
|
||||||
|
invalid_codes = ['sl-SL', 'wt-WT', 'jw']
|
||||||
|
invalid_countries = ['UK', 'XA', 'XL']
|
||||||
if lang_code[:2] == 'xx'\
|
if lang_code[:2] == 'xx'\
|
||||||
or lang_code == 'sl-SL'\
|
or lang_code in invalid_codes\
|
||||||
or lang_code == 'wt-WT'\
|
or lang_code[-2:] in invalid_countries\
|
||||||
or lang_code == 'jw'\
|
or is_dialect(lang_code):
|
||||||
or lang_code[-2:] == 'UK'\
|
|
||||||
or lang_code[-2:] == 'XA'\
|
|
||||||
or lang_code[-2:] == 'XL':
|
|
||||||
return False
|
|
||||||
|
|
||||||
# filter dialects
|
|
||||||
lang_code = lang_code.split('-')
|
|
||||||
if len(lang_code) > 2 or len(lang_code[0]) > 3:
|
|
||||||
return False
|
|
||||||
if len(lang_code) == 2 and len(lang_code[1]) > 2:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
# Language codes with any additional tags other than language and country.
|
||||||
|
def is_dialect(lang_code):
|
||||||
|
lang_code = lang_code.split('-')
|
||||||
|
if len(lang_code) > 2 or len(lang_code[0]) > 3:
|
||||||
|
return True
|
||||||
|
if len(lang_code) == 2 and len(lang_code[1]) > 2:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
# Get country name in specified language.
|
# Get country name in specified language.
|
||||||
def get_country_name(locale):
|
def get_country_name(locale):
|
||||||
if geonames_user is '':
|
if geonames_user is '':
|
||||||
|
@ -83,19 +86,17 @@ def fetch_supported_languages():
|
||||||
print e
|
print e
|
||||||
|
|
||||||
# write json file
|
# write json file
|
||||||
f = io.open(engines_languages_file, "w", encoding="utf-8")
|
with io.open(engines_languages_file, "w", encoding="utf-8") as f:
|
||||||
f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
|
f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
|
||||||
f.close()
|
|
||||||
|
|
||||||
|
|
||||||
# Join all language lists.
|
# Join all language lists.
|
||||||
# Iterate all languages supported by each engine.
|
# Iterate all languages supported by each engine.
|
||||||
def join_language_lists():
|
def join_language_lists():
|
||||||
# include wikipedia first for more accurate language names
|
# include wikipedia first for more accurate language names
|
||||||
# exclude languages with too few articles
|
|
||||||
languages.update({code: lang for code, lang
|
languages.update({code: lang for code, lang
|
||||||
in engines_languages['wikipedia'].iteritems()
|
in engines_languages['wikipedia'].iteritems()
|
||||||
if valid_code(code) and lang['articles'] >= 100000})
|
if valid_code(code)})
|
||||||
|
|
||||||
for engine_name in engines_languages:
|
for engine_name in engines_languages:
|
||||||
for locale in engines_languages[engine_name]:
|
for locale in engines_languages[engine_name]:
|
||||||
|
@ -104,25 +105,27 @@ def join_language_lists():
|
||||||
|
|
||||||
# if language is not on list or if it has no name yet
|
# if language is not on list or if it has no name yet
|
||||||
if locale not in languages or not languages[locale].get('name'):
|
if locale not in languages or not languages[locale].get('name'):
|
||||||
if isinstance(engines_languages[engine_name], dict) \
|
if isinstance(engines_languages[engine_name], dict):
|
||||||
and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
|
|
||||||
languages[locale] = engines_languages[engine_name][locale]
|
languages[locale] = engines_languages[engine_name][locale]
|
||||||
else:
|
else:
|
||||||
languages[locale] = {}
|
languages[locale] = {}
|
||||||
|
|
||||||
# get locales that have no name or country yet
|
# get locales that have no name or country yet
|
||||||
for locale in languages.keys():
|
for locale in languages.keys():
|
||||||
|
# try to get language names
|
||||||
if not languages[locale].get('name'):
|
if not languages[locale].get('name'):
|
||||||
# try to get language names
|
|
||||||
name = languages.get(locale.split('-')[0], {}).get('name', None)
|
name = languages.get(locale.split('-')[0], {}).get('name', None)
|
||||||
if name:
|
if name:
|
||||||
languages[locale]['name'] = name
|
languages[locale]['name'] = name
|
||||||
languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
|
|
||||||
else:
|
else:
|
||||||
# filter out locales with no name
|
# filter out locales with no name
|
||||||
del languages[locale]
|
del languages[locale]
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# try to get language name in english
|
||||||
|
if not languages[locale].get('english_name'):
|
||||||
|
languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
|
||||||
|
|
||||||
# try to get country name
|
# try to get country name
|
||||||
if locale.find('-') > 0 and not languages[locale].get('country'):
|
if locale.find('-') > 0 and not languages[locale].get('country'):
|
||||||
languages[locale]['country'] = get_country_name(locale) or ''
|
languages[locale]['country'] = get_country_name(locale) or ''
|
||||||
|
@ -145,10 +148,10 @@ def filter_single_country_languages():
|
||||||
# Write languages.py.
|
# Write languages.py.
|
||||||
def write_languages_file():
|
def write_languages_file():
|
||||||
new_file = open(languages_file, 'w')
|
new_file = open(languages_file, 'w')
|
||||||
file_content = '# -*- coding: utf-8 -*-\n'
|
file_content = '# -*- coding: utf-8 -*-\n'\
|
||||||
file_content += '# list of language codes\n'
|
+ '# list of language codes\n'\
|
||||||
file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
|
+ '# this file is generated automatically by utils/update_search_languages.py\n'\
|
||||||
file_content += '\nlanguage_codes = ('
|
+ '\nlanguage_codes = ('
|
||||||
for code in sorted(languages):
|
for code in sorted(languages):
|
||||||
file_content += '\n (u"' + code + '"'\
|
file_content += '\n (u"' + code + '"'\
|
||||||
+ ', u"' + languages[code]['name'].split(' (')[0] + '"'\
|
+ ', u"' + languages[code]['name'].split(' (')[0] + '"'\
|
||||||
|
|
Loading…
Reference in a new issue