[mod] fetch supported languages for several engines

utils/fetch_languages.py fetches the languages supported by each engine and
generates engines_languages.json with each engine's supported languages.
marc 2016-11-05 20:51:38 -06:00
parent 92c6e88ad3
commit f62ce21f50
26 changed files with 3633 additions and 362 deletions
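For orientation, a sketch of the shape of the generated engines_languages.json (read back at engine load time in the first hunk below). The concrete entries here are illustrative, not copied from the real file — some engines yield a plain list of locale codes, others a dict keyed by code:

{
    "bing": ["de-DE", "en-US", "fr-FR"],
    "dailymotion": {"en": {"name": "English", "english_name": "English"}},
    "wikipedia": {"fi": {"name": "Suomi", "english_name": "Finnish", "articles": 400000}}
}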

File diff suppressed because it is too large.


@@ -20,6 +20,7 @@ from os.path import realpath, dirname
 import sys
 from flask_babel import gettext
 from operator import itemgetter
+from json import loads
 from searx import settings
 from searx import logger
 from searx.utils import load_module
@@ -78,6 +79,9 @@ def load_engine(engine_data):
         if not hasattr(engine, arg_name):
             setattr(engine, arg_name, arg_value)
 
+    if engine_data['name'] in languages:
+        setattr(engine, 'supported_languages', languages[engine_data['name']])
+
     # checking required variables
     for engine_attr in dir(engine):
         if engine_attr.startswith('_'):
@@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
     logger.error('No engines found. Edit your settings.yml')
     exit(2)
 
+languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
+
 for engine_data in settings['engines']:
     engine = load_engine(engine_data)
     if engine is not None:
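A minimal sketch of what this lookup provides at runtime (illustrative; assumes an engine named 'bing' is configured and present in the JSON):

from searx.engines import engines

print engines['bing'].supported_languages  # attached by load_engine(), no network call needed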


@@ -15,12 +15,14 @@
 from urllib import urlencode
 from lxml import html
+from requests import get
 from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
 
 # search-url
 base_url = 'https://www.bing.com/'
@@ -81,3 +83,16 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//div[@id="limit-languages"]//input')
+    for option in options:
+        code = option.xpath('./@id')[0].replace('_', '-')
+        supported_languages.append(code)
+    return supported_languages
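The Bing hook rewrites the checkbox ids on that page into locale codes ('de_DE' becomes 'de-DE'). Calling it directly performs a live request; the sample output is invented:

from searx.engines import bing

codes = bing.fetch_supported_languages()  # e.g. ['bg-BG', 'de-DE', 'en-US', ...]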


@@ -19,7 +19,7 @@ from urllib import urlencode
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
 
 # engine dependent config
 categories = ['images']


@@ -17,7 +17,7 @@ from datetime import datetime
 from dateutil import parser
 from lxml import etree
 from searx.utils import list_get
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages
 
 # engine dependent config
 categories = ['news']


@@ -15,29 +15,12 @@
 from urllib import urlencode
 from json import loads
 from datetime import datetime
+from requests import get
 
 # engine dependent config
 categories = ['videos']
 paging = True
 language_support = True
-supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
-                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
-                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
-                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
-                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
-                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
-                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
-                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
-                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
-                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
-                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
-                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
-                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
-                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
-                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
-                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
-                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
-                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
 
 # search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
@@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
 embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
     'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
 
+supported_languages_url = 'https://api.dailymotion.com/languages'
+
 # do search-request
 def request(query, params):
@@ -92,3 +77,23 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+
+    response_json = loads(response.text)
+    for language in response_json['list']:
+        supported_languages[language['code']] = {}
+
+        name = language['native_name']
+        if name:
+            supported_languages[language['code']]['name'] = name
+        english_name = language['name']
+        if english_name:
+            supported_languages[language['code']]['english_name'] = english_name
+
+    return supported_languages
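Unlike the list-returning hooks, this one preserves native and English names, producing a mapping shaped like {'en': {'name': 'English', 'english_name': 'English'}, ...} (values illustrative); the utils script below later prefers entries that already carry a 'name'.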


@@ -15,19 +15,15 @@
 from urllib import urlencode
 from lxml.html import fromstring
+from requests import get
+from json import loads
 from searx.engines.xpath import extract_text
 
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
-supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
-                       "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
-                       "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
-                       "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
-                       "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
-                       "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
-                       "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
+supported_languages_url = 'https://duckduckgo.com/d2030.js'
 time_range_support = True
 
 # search-url
@@ -65,8 +61,6 @@ def request(query, params):
         locale = 'xa' + params['language'].split('-')[0]
     elif params['language'][-2:] == 'GB':
         locale = 'uk' + params['language'].split('-')[0]
-    elif params['language'] == 'es-419':
-        locale = 'xl-es'
     else:
         locale = params['language'].split('-')
         if len(locale) == 2:
@@ -120,3 +114,18 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    response = get(supported_languages_url)
+
+    # response is a js file with regions as an embedded object
+    response_page = response.text
+    response_page = response_page[response_page.find('regions:{') + 8:]
+    response_page = response_page[:response_page.find('}') + 1]
+
+    regions_json = loads(response_page)
+    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
+
+    return supported_languages
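How the d2030.js slicing isolates the embedded regions object, traced on a made-up input (the real file's surroundings differ):

from json import loads

page = 'config={regions:{"ar-es":"Argentina","us-en":"United States"}};'
snippet = page[page.find('regions:{') + 8:]   # start at the opening brace
snippet = snippet[:snippet.find('}') + 1]     # keep through the closing brace
regions = loads(snippet)                      # {"ar-es": "Argentina", "us-en": "United States"}
codes = [x[3:] + '-' + x[:2].upper() for x in regions.keys()]
# e.g. ['es-AR', 'en-US'] -- country-first keys flipped into lang-COUNTRY locales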


@@ -4,7 +4,7 @@ from re import compile, sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import supported_languages
+from searx.engines.duckduckgo import fetch_supported_languages
 
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'


@@ -14,6 +14,8 @@ from json import loads
 from random import randint
 from time import time
 from urllib import urlencode
+from requests import get
+from lxml.html import fromstring
 
 # engine dependent config
 categories = ['general']
@@ -40,11 +42,7 @@ url_xpath = './/url'
 title_xpath = './/title'
 content_xpath = './/sum'
-supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
-                       "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
-                       "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
-                       "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
+supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
 
 # do search-request
@@ -90,3 +88,17 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = fromstring(response.text)
+    links = dom.xpath('//span[@id="menu2"]/a')
+    for link in links:
+        code = link.xpath('./@href')[0][-2:]
+        if code != 'xx' and code not in supported_languages:
+            supported_languages.append(code)
+    return supported_languages


@@ -12,6 +12,7 @@ import re
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
+from requests import get
 from searx.engines.xpath import extract_text, extract_url
 from searx.search import logger
@@ -23,20 +24,6 @@ categories = ['general']
 paging = True
 language_support = True
 use_locale_domain = True
-supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
-                       "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
-                       "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
-                       "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
-                       "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
-                       "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
-                       "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
-                       "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
-                       "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
-                       "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
-                       "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
-                       "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
-                       "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
-                       "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
 time_range_support = True
 
 # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
 maps_path = '/maps'
 redirect_path = '/url'
 images_path = '/images'
+supported_languages_url = 'https://www.google.com/preferences?#languages'
 
 # specific xpath variables
 results_xpath = '//div[@class="g"]'
@@ -373,3 +361,17 @@ def attributes_to_html(attributes):
         retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
     retval = retval + '</table>'
     return retval
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//select[@name="hl"]/option')
+    for option in options:
+        code = option.xpath('./@value')[0].split('-')[0]
+        name = option.text[:-1].title()
+        supported_languages[code] = {"name": name}
+    return supported_languages
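The option.text[:-1] slice drops the final character of each label before title-casing, and only the primary subtag of the value is kept, so the result is keyed by bare language codes, e.g. {'en': {'name': 'English'}, 'de': {'name': 'Deutsch'}} (illustrative).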


@@ -13,7 +13,7 @@
 from lxml import html
 from urllib import urlencode
 from json import loads
-from searx.engines.google import supported_languages
+from searx.engines.google import fetch_supported_languages
 
 # search-url
 categories = ['news']


@@ -15,7 +15,6 @@
 from json import loads
 from string import Formatter
 from urllib import urlencode, quote
-from searx.engines.wikipedia import supported_languages
 
 # engine dependent config
 categories = ['general']


@@ -20,11 +20,6 @@ from searx.utils import html_to_text
 categories = None
 paging = True
 language_support = True
-supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
-                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
-                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
-                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
-                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
 
 category_to_keyword = {'general': 'web',
                        'images': 'images',
@@ -51,15 +46,7 @@ def request(query, params):
 
     # add language tag if specified
     if params['language'] != 'all':
-        locale = params['language'].split('-')
-        if len(locale) == 2 and params['language'] in supported_languages:
-            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
-        else:
-            # try to get a country code for language
-            for lang in supported_languages:
-                if locale[0] == lang.split('-')[0]:
-                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
-                    break
+        params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
 
     return params
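Since load_engine() now attaches supported_languages from engines_languages.json before any request is built, qwant's request() can presumably drop the membership check and the country-guessing fallback and pass the locale straight through.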


@@ -24,11 +24,6 @@ categories = ['general']
 # paging = False
 language_support = True
-supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
-                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
-                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
-                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
-                       "sv", "tl", "th", "tr", "uk", "vi"]
 
 # search-url
 base_url = 'https://startpage.com/'


@@ -22,7 +22,7 @@ language = ""
 
 # search-url
 url = 'http://www.subtitleseeker.com/'
-search_url = url + 'search/TITLES/{query}&p={pageno}'
+search_url = url + 'search/TITLES/{query}?p={pageno}'
 
 # specific xpath variables
 results_xpath = '//div[@class="boxRows"]'
@@ -51,7 +51,8 @@ def response(resp):
     elif resp.search_params['language'] != 'all':
         search_lang = [lc[3]
                        for lc in language_codes
-                       if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
+                       if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
+        search_lang = search_lang[0].split(' (')[0]
 
     # parse results
     for result in dom.xpath(results_xpath):
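The added split(' (')[0] trims a parenthesized qualifier from the matched English name, e.g. u'Norwegian (Bokmål)'.split(' (')[0] == u'Norwegian', so the lookup still yields a plain label if a generated language_codes entry keeps one.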


@@ -13,17 +13,13 @@
 from json import loads
 from urllib import urlencode, unquote
 import re
+from requests import get
+from lxml.html import fromstring
 
 # engine dependent config
 categories = ['general', 'images']
 paging = True
 language_support = True
-supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
-                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
-                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
-                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
-                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
-                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
 
 # search-url
 base_url = 'https://swisscows.ch/'
@@ -114,3 +110,16 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(base_url)
+    dom = fromstring(response.text)
+    options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
+    for option in options:
+        code = option.xpath('./@data-val')[0]
+        supported_languages.append(code)
+    return supported_languages


@@ -15,7 +15,7 @@ from searx import logger
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text
 from searx.utils import format_date_by_locale
-from searx.engines.wikipedia import supported_languages
+from searx.engines.wikipedia import fetch_supported_languages
 
 from json import loads
 from lxml.html import fromstring
@@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 
 def request(query, params):
-    language = params['language'].split('_')[0]
+    language = params['language'].split('-')[0]
     if language == 'all':
         language = 'en'
@@ -72,7 +72,7 @@ def response(resp):
     html = fromstring(resp.content)
     wikidata_ids = html.xpath(wikidata_ids_xpath)
 
-    language = resp.search_params['language'].split('_')[0]
+    language = resp.search_params['language'].split('-')[0]
     if language == 'all':
         language = 'en'


@@ -12,36 +12,9 @@
 from json import loads
 from urllib import urlencode, quote
+from requests import get
+from lxml.html import fromstring
 
-supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
-                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
-                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
-                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
-                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
-                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
-                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
-                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
-                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
-                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
-                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
-                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
-                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
-                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
-                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
-                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
-                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
-                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
-                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
-                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
-                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
-                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
-                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
-                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
-                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
-                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
-                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
-                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
-                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
 
 # search-url
 base_url = 'https://{language}.wikipedia.org/'
@@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
     '&explaintext'\
     '&pithumbsize=300'\
     '&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
 
 # set language in base_url
@@ -142,3 +116,24 @@ def response(resp):
 
                     'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
 
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+    dom = fromstring(response.text)
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
+    for table in tables:
+        # exclude header row
+        trs = table.xpath('.//tr')[1:]
+        for tr in trs:
+            td = tr.xpath('./td')
+            code = td[3].xpath('./a')[0].text
+            name = td[2].xpath('./a')[0].text
+            english_name = td[1].xpath('./a')[0].text
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+
+            if articles >= 10000:
+                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
+
+    return supported_languages
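Note the two thresholds: this scraper keeps any Wikipedia with at least 10,000 articles, while join_language_lists() in the new utils script below applies a stricter 100,000-article cut when seeding language names, so the JSON deliberately retains more than the generated languages.py uses.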


@@ -14,16 +14,13 @@
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
+from requests import get
 from searx.engines.xpath import extract_text, extract_url
 
 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
-supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
-                       "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
-                       "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
-                       "sl", "es", "sv", "th", "tr"]
 time_range_support = True
 
 # search-url
@@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
 
+supported_languages_url = 'https://search.yahoo.com/web/advanced'
+
 # specific xpath variables
 results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
 url_xpath = './/h3/a/@href'
@@ -142,3 +141,16 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//div[@id="yschlang"]/span/label/input')
+    for option in options:
+        code = option.xpath('./@value')[0][5:]
+        supported_languages.append(code)
+    return supported_languages
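The [5:] slice strips the lang_ prefix from the advanced-search checkbox values, e.g. 'lang_fr'[5:] == 'fr', matching the vl=lang_{lang} parameter in the search URLs above.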


@@ -12,7 +12,7 @@
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, supported_languages
+from searx.engines.yahoo import parse_url, fetch_supported_languages
 from datetime import datetime, timedelta
 import re
 from dateutil import parser


@@ -4,39 +4,29 @@
 language_codes = (
     (u"ach", u"Acoli", u"", u""),
-    (u"af", u"Afrikaans", u"", u"Afrikaans"),
+    (u"af", u"Afrikaans", u"", u""),
     (u"ak", u"Akan", u"", u""),
-    (u"als", u"Alemannisch", u"", u"Alemannic"),
-    (u"am", u"አማርኛ", u"", u"Amharic"),
-    (u"an", u"Aragonés", u"", u"Aragonese"),
+    (u"am", u"አማርኛ", u"", u""),
     (u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
-    (u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
-    (u"ast", u"Asturianu", u"", u"Asturian"),
     (u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
-    (u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
-    (u"ba", u"Башҡорт", u"", u"Bashkir"),
     (u"ban", u"Balinese", u"", u""),
-    (u"bar", u"Boarisch", u"", u"Bavarian"),
     (u"be", u"Беларуская", u"", u"Belarusian"),
     (u"bem", u"Ichibemba", u"", u""),
     (u"bg-BG", u"Български", u"България", u"Bulgarian"),
-    (u"bn", u"বাংলা", u"", u"Bengali"),
-    (u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
-    (u"br", u"Brezhoneg", u"", u"Breton"),
-    (u"bs", u"Bosanski", u"", u"Bosnian"),
-    (u"bug", u"Basa Ugi", u"", u"Buginese"),
+    (u"bn", u"বাংলা", u"", u""),
+    (u"br", u"Brezhoneg", u"", u""),
+    (u"bs", u"Bosanski", u"", u""),
     (u"ca", u"Català", u"", u"Catalan"),
     (u"ca-CT", u"Català", u"", u"Catalan"),
     (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
     (u"ce", u"Нохчийн", u"", u"Chechen"),
     (u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
     (u"chr", u"ᏣᎳᎩ", u"", u""),
-    (u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
+    (u"ckb", u"Central Kurdish", u"", u""),
     (u"co", u"Corsican", u"", u""),
     (u"crs", u"Seychellois Creole", u"", u""),
     (u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
-    (u"cv", u"Чăваш", u"", u"Chuvash"),
-    (u"cy", u"Cymraeg", u"", u"Welsh"),
+    (u"cy", u"Cymraeg", u"", u""),
     (u"da-DK", u"Dansk", u"Danmark", u"Danish"),
     (u"de", u"Deutsch", u"", u"German"),
     (u"de-AT", u"Deutsch", u"Österreich", u"German"),
@@ -70,148 +60,129 @@ language_codes = (
     (u"eu", u"Euskara", u"", u"Basque"),
     (u"fa", u"فارسی", u"", u"Persian"),
     (u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
-    (u"fo", u"Føroyskt", u"", u"Faroese"),
+    (u"fo", u"Føroyskt", u"", u""),
     (u"fr", u"Français", u"", u"French"),
     (u"fr-BE", u"Français", u"Belgique", u"French"),
     (u"fr-CA", u"Français", u"Canada", u"French"),
     (u"fr-CH", u"Français", u"Suisse", u"French"),
     (u"fr-FR", u"Français", u"France", u"French"),
-    (u"fy", u"Frysk", u"", u"West Frisian"),
-    (u"ga", u"Gaeilge", u"", u"Irish"),
+    (u"fy", u"West-Frysk", u"", u""),
+    (u"ga", u"Gaeilge", u"", u""),
     (u"gaa", u"Ga", u"", u""),
-    (u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
+    (u"gd", u"Gàidhlig", u"", u""),
     (u"gl", u"Galego", u"", u"Galician"),
     (u"gn", u"Guarani", u"", u""),
-    (u"gu", u"ગુજરાતી", u"", u"Gujarati"),
+    (u"gu", u"ગુજરાતી", u"", u""),
     (u"ha", u"Hausa", u"", u""),
     (u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
     (u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
     (u"hi", u"हिन्दी", u"", u"Hindi"),
     (u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
-    (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
-    (u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
+    (u"ht", u"Haitian Creole", u"", u""),
     (u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
     (u"hy", u"Հայերեն", u"", u"Armenian"),
-    (u"ia", u"Interlingua", u"", u"Interlingua"),
+    (u"ia", u"Interlingua", u"", u""),
     (u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
     (u"ig", u"Igbo", u"", u""),
-    (u"io", u"Ido", u"", u"Ido"),
-    (u"is", u"Íslenska", u"", u"Icelandic"),
+    (u"is", u"Íslenska", u"", u""),
     (u"it", u"Italiano", u"", u"Italian"),
     (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
     (u"it-IT", u"Italiano", u"Italia", u"Italian"),
     (u"iw", u"עברית", u"", u""),
     (u"ja-JP", u"日本語", u"日本", u"Japanese"),
-    (u"jv", u"Basa Jawa", u"", u"Javanese"),
     (u"ka", u"ქართული", u"", u"Georgian"),
     (u"kg", u"Kongo", u"", u""),
     (u"kk", u"Қазақша", u"", u"Kazakh"),
     (u"km", u"ខ្មែរ", u"", u""),
-    (u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
+    (u"kn", u"ಕನ್ನಡ", u"", u""),
     (u"ko-KR", u"한국어", u"대한민국", u"Korean"),
-    (u"kri", u"Krio (Sierra Leone)", u"", u""),
-    (u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
-    (u"ky", u"Кыргызча", u"", u"Kirghiz"),
+    (u"kri", u"Krio", u"", u""),
+    (u"ky", u"Кыргызча", u"", u""),
     (u"la", u"Latina", u"", u"Latin"),
-    (u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
     (u"lg", u"Luganda", u"", u""),
-    (u"li", u"Limburgs", u"", u"Limburgish"),
-    (u"lmo", u"Lumbaart", u"", u"Lombard"),
     (u"ln", u"Lingála", u"", u""),
     (u"lo", u"ລາວ", u"", u""),
     (u"loz", u"Lozi", u"", u""),
     (u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
     (u"lua", u"Luba-Lulua", u"", u""),
-    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
+    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
     (u"mfe", u"Kreol Morisien", u"", u""),
-    (u"mg", u"Malagasy", u"", u"Malagasy"),
+    (u"mg", u"Malagasy", u"", u""),
     (u"mi", u"Maori", u"", u""),
     (u"min", u"Minangkabau", u"", u"Minangkabau"),
-    (u"mk", u"Македонски", u"", u"Macedonian"),
-    (u"ml", u"മലയാളം", u"", u"Malayalam"),
-    (u"mn", u"Монгол", u"", u"Mongolian"),
-    (u"mr", u"मराठी", u"", u"Marathi"),
-    (u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
+    (u"mk", u"Македонски", u"", u""),
+    (u"ml", u"മലയാളം", u"", u""),
+    (u"mn", u"Монгол", u"", u""),
+    (u"mr", u"मराठी", u"", u""),
     (u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
     (u"mt", u"Malti", u"", u""),
-    (u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
-    (u"mzn", u"مَزِروني", u"", u"Mazandarani"),
-    (u"nah", u"Nāhuatl", u"", u"Nahuatl"),
-    (u"nap", u"Nnapulitano", u"", u"Neapolitan"),
-    (u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
-    (u"ne", u"नेपाली", u"", u"Nepali"),
-    (u"new", u"नेपाल भाषा", u"", u"Newar"),
+    (u"my", u"ဗမာ", u"", u""),
+    (u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
+    (u"ne", u"नेपाली", u"", u""),
     (u"nl", u"Nederlands", u"", u"Dutch"),
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
-    (u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
-    (u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
+    (u"nn", u"Nynorsk", u"", u"Norwegian"),
+    (u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
     (u"nso", u"Northern Sotho", u"", u""),
     (u"ny", u"Nyanja", u"", u""),
     (u"nyn", u"Runyankore", u"", u""),
-    (u"oc", u"Occitan", u"", u"Occitan"),
+    (u"oc", u"Occitan", u"", u""),
     (u"om", u"Oromoo", u"", u""),
-    (u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
-    (u"os", u"Иронау", u"", u"Ossetian"),
-    (u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
+    (u"or", u"ଓଡ଼ିଆ", u"", u""),
+    (u"pa", u"ਪੰਜਾਬੀ", u"", u""),
     (u"pcm", u"Nigerian Pidgin", u"", u""),
     (u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
-    (u"pms", u"Piemontèis", u"", u"Piedmontese"),
-    (u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
     (u"ps", u"پښتو", u"", u""),
     (u"pt", u"Português", u"", u"Portuguese"),
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
-    (u"qu", u"Runa Simi", u"", u"Quechua"),
+    (u"qu", u"Runasimi", u"", u""),
     (u"rm", u"Rumantsch", u"", u""),
     (u"rn", u"Ikirundi", u"", u""),
     (u"ro-RO", u"Română", u"România", u"Romanian"),
     (u"ru-RU", u"Русский", u"Россия", u"Russian"),
     (u"rw", u"Kinyarwanda", u"", u""),
-    (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
-    (u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
-    (u"scn", u"Sicilianu", u"", u"Sicilian"),
-    (u"sco", u"Scots", u"", u"Scots"),
     (u"sd", u"Sindhi", u"", u""),
     (u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
-    (u"si", u"සිංහල", u"", u"Sinhalese"),
+    (u"si", u"සිංහල", u"", u""),
     (u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
-    (u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
+    (u"sl", u"Slovenščina", u"", u"Slovenian"),
     (u"sn", u"Chishona", u"", u""),
     (u"so", u"Soomaali", u"", u""),
-    (u"sq", u"Shqip", u"", u"Albanian"),
-    (u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
+    (u"sq", u"Shqip", u"", u""),
+    (u"sr", u"Српски / Srpski", u"", u"Serbian"),
     (u"st", u"Southern Sotho", u"", u""),
-    (u"su", u"Basa Sunda", u"", u"Sundanese"),
+    (u"su", u"Sundanese", u"", u""),
     (u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
-    (u"sw", u"Kiswahili", u"", u"Swahili"),
-    (u"ta", u"தமிழ்", u"", u"Tamil"),
-    (u"te", u"తెలుగు", u"", u"Telugu"),
-    (u"tg", u"Тоҷикӣ", u"", u"Tajik"),
+    (u"sw", u"Kiswahili", u"", u""),
+    (u"ta", u"தமிழ்", u"", u""),
+    (u"te", u"తెలుగు", u"", u""),
+    (u"tg", u"Tajik", u"", u""),
     (u"th-TH", u"ไทย", u"ไทย", u"Thai"),
     (u"ti", u"ትግርኛ", u"", u""),
     (u"tk", u"Turkmen", u"", u""),
-    (u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
+    (u"tl-PH", u"Filipino", u"Pilipinas", u""),
     (u"tlh", u"Klingon", u"", u""),
     (u"tn", u"Tswana", u"", u""),
     (u"to", u"Lea Fakatonga", u"", u""),
     (u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
-    (u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
+    (u"tt", u"Tatar", u"", u""),
     (u"tum", u"Tumbuka", u"", u""),
     (u"tw", u"Twi", u"", u""),
     (u"ug", u"ئۇيغۇرچە", u"", u""),
     (u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
     (u"ur", u"اردو", u"", u"Urdu"),
     (u"uz", u"Ozbek", u"", u"Uzbek"),
-    (u"vec", u"Vèneto", u"", u"Venetian"),
+    (u"ve", u"Venda", u"", u"Venda"),
     (u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
     (u"vo", u"Volapük", u"", u"Volapük"),
     (u"wa", u"Walon", u"", u"Walloon"),
     (u"war", u"Winaray", u"", u"Waray-Waray"),
     (u"wo", u"Wolof", u"", u""),
     (u"xh", u"Xhosa", u"", u""),
-    (u"yi", u"ייִדיש", u"", u"Yiddish"),
-    (u"yo", u"Yorùbá", u"", u"Yoruba"),
+    (u"yi", u"ייִדיש", u"", u""),
+    (u"yo", u"Èdè Yorùbá", u"", u""),
     (u"zh", u"中文", u"", u"Chinese"),
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
     (u"zh-HK", u"中文", u"香港", u"Chinese"),


@@ -514,7 +514,7 @@ def index():
         answers=result_container.answers,
         infoboxes=result_container.infoboxes,
         paging=result_container.paging,
-        current_language=search.lang,
+        current_language=search_query.lang,
         base_url=get_base_url(),
         theme=get_current_theme_name(),
         favicons=global_favicons[themes.index(get_current_theme_name())]


@@ -17,7 +17,7 @@ class TestSubtitleseekerEngine(SearxTestCase):
 
     def test_response(self):
         dicto = defaultdict(dict)
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         response = mock.Mock(search_params=dicto)
 
         self.assertRaises(AttributeError, subtitleseeker.response, None)


@@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
 
 class TestWikipediaEngine(SearxTestCase):
 
     def test_request(self):
+        wikipedia.supported_languages = ['fr', 'en']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['language'] = 'fr-FR'
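Because supported_languages is now attached at engine load time from engines_languages.json rather than hard-coded in the module, the test has to stub the attribute before exercising request().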

utils/fetch_languages.py (new file, 164 lines)

@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+
+# This script generates languages.py from intersecting each engine's supported languages.
+#
+# The country names are obtained from http://api.geonames.org which requires registering as a user.
+#
+# Output files (engines_languages.json and languages.py)
+# are written in current directory to avoid overwriting in case something goes wrong.
+
+from requests import get
+from urllib import urlencode
+from lxml.html import fromstring
+from json import loads, dumps
+import io
+from sys import path
+path.append('../searx')  # noqa
+
+from searx.engines import engines
+
+# Geonames API for country names.
+geonames_user = ''  # ADD USER NAME HERE
+country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
+
+# Output files.
+engines_languages_file = 'engines_languages.json'
+languages_file = 'languages.py'
+
+engines_languages = {}
+languages = {}
+
+
+# To filter out invalid codes and dialects.
+def valid_code(lang_code):
+    # filter invalid codes
+    # sl-SL is technically not invalid, but still a mistake
+    if lang_code[:2] == 'xx'\
+       or lang_code == 'sl-SL'\
+       or lang_code == 'wt-WT'\
+       or lang_code == 'jw'\
+       or lang_code[-2:] == 'UK'\
+       or lang_code[-2:] == 'XA'\
+       or lang_code[-2:] == 'XL':
+        return False
+
+    # filter dialects
+    lang_code = lang_code.split('-')
+    if len(lang_code) > 2 or len(lang_code[0]) > 3:
+        return False
+    if len(lang_code) == 2 and len(lang_code[1]) > 2:
+        return False
+
+    return True
+
+
+# Get country name in specified language.
+def get_country_name(locale):
+    if geonames_user is '':
+        return ''
+
+    locale = locale.split('-')
+    if len(locale) != 2:
+        return ''
+
+    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
+                                                         'country': locale[1],
+                                                         'username': geonames_user}))
+    response = get(url)
+    json = loads(response.text)
+    content = json.get('geonames', None)
+    if content is None or len(content) != 1:
+        print "No country name found for " + locale[0] + "-" + locale[1]
+        return ''
+
+    return content[0].get('countryName', '')
+
+
+# Fetches supported languages for each engine and writes a json file with those.
+def fetch_supported_languages():
+    for engine_name in engines:
+        if hasattr(engines[engine_name], 'fetch_supported_languages'):
+            try:
+                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
+            except Exception as e:
+                print e
+
+    # write json file
+    f = io.open(engines_languages_file, "w", encoding="utf-8")
+    f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
+    f.close()
+
+
+# Join all language lists.
+# Iterate all languages supported by each engine.
+def join_language_lists():
+    # include wikipedia first for more accurate language names
+    # exclude languages with too few articles
+    languages.update({code: lang for code, lang
+                      in engines_languages['wikipedia'].iteritems()
+                      if valid_code(code) and lang['articles'] >= 100000})
+
+    for engine_name in engines_languages:
+        for locale in engines_languages[engine_name]:
+            if not valid_code(locale):
+                continue
+
+            # if language is not on list or if it has no name yet
+            if locale not in languages or not languages[locale].get('name'):
+                if isinstance(engines_languages[engine_name], dict) \
+                   and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
+                    languages[locale] = engines_languages[engine_name][locale]
+                else:
+                    languages[locale] = {}
+
+    # get locales that have no name yet
+    for locale in languages.keys():
+        if not languages[locale].get('name'):
+            # try to get language and country names
+            name = languages.get(locale.split('-')[0], {}).get('name', None)
+            if name:
+                languages[locale]['name'] = name
+                languages[locale]['country'] = get_country_name(locale) or ''
+                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
+            else:
+                # filter out locales with no name
+                del languages[locale]
+
+
+# Remove countryless language if language is featured in only one country.
+def filter_single_country_languages():
+    prev_lang = None
+    for code in sorted(languages):
+        lang = code.split('-')[0]
+        if lang == prev_lang:
+            countries += 1
+        else:
+            if prev_lang is not None and countries == 1:
+                del languages[prev_lang]
+            countries = 0
+        prev_lang = lang
+
+
+# Write languages.py.
+def write_languages_file():
+    new_file = open(languages_file, 'w')
+    file_content = '# -*- coding: utf-8 -*-\n'
+    file_content += '# list of language codes\n'
+    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
+    file_content += '\nlanguage_codes = ('
+    for code in sorted(languages):
+        file_content += '\n    (u"' + code + '"'\
+                        + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
+                        + ', u"' + languages[code].get('country', '') + '"'\
+                        + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
+    # remove last comma
+    file_content = file_content[:-1]
+    file_content += '\n)\n'
+    new_file.write(file_content.encode('utf8'))
+    new_file.close()
+
+
+if __name__ == "__main__":
+    fetch_supported_languages()
+    join_language_lists()
+    filter_single_country_languages()
+    write_languages_file()
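Illustrative checks of valid_code() under the rules above (hypothetical direct import; comments give the expected results):

from fetch_languages import valid_code

valid_code('fr')          # True: bare two-letter code
valid_code('pt-BR')       # True: language plus two-letter country
valid_code('xx-bork')     # False: 'xx' joke locales are filtered out
valid_code('en-UK')       # False: 'UK' is rejected ('en-GB' would be the valid form)
valid_code('be-tarask')   # False: dialect, second subtag longer than two characters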


@@ -1,169 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This script generates languages.py from
-# intersecting each engine's supported languages.
-#
-# The language's native names are obtained from
-# Wikipedia and Google's supported languages.
-#
-# The country names are obtained from http://api.geonames.org
-# which requires registering as a user.
-#
-# Output file (languages.py) is written in current directory
-# to avoid overwriting in case something goes wrong.
-
-from requests import get
-from urllib import urlencode
-from lxml.html import fromstring
-from json import loads
-from sys import path
-path.append('../searx')
-
-from searx.engines import engines
-
-# list of names
-wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-google_languages_url = 'https://www.google.com/preferences?#languages'
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
-
-geonames_user = ''  # add user name here
-
-google_json_name = 'google.preferences.langMap'
-
-languages = {}
-
-
-# To filter out invalid codes and dialects.
-def valid_code(lang_code):
-    # filter invalid codes
-    # sl-SL is technically not invalid, but still a mistake
-    if lang_code[:2] == 'xx'\
-       or lang_code == 'sl-SL'\
-       or lang_code == 'jw'\
-       or lang_code[-2:] == 'UK'\
-       or lang_code[-2:] == 'XA'\
-       or lang_code[-2:] == 'XL':
-        return False
-
-    # filter dialects
-    lang_code = lang_code.split('-')
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
-        return False
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
-        return False
-
-    return True
-
-
-# Get country name in specified language.
-def get_country_name(locale):
-    if geonames_user is '':
-        return ''
-
-    locale = locale.split('-')
-    if len(locale) != 2:
-        return ''
-
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
-                                                         'country': locale[1],
-                                                         'username': geonames_user}))
-    response = get(url)
-    json = loads(response.text)
-    content = json.get('geonames', None)
-    if content is None or len(content) != 1:
-        print "No country name found for " + locale[0] + "-" + locale[1]
-        print json
-        return ''
-
-    return content[0].get('countryName', '')
-
-
-# Get language names from Wikipedia.
-def get_wikipedia_languages():
-    response = get(wiki_languages_url)
-    dom = fromstring(response.text)
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
-    for table in tables:
-        # exclude header row
-        trs = table.xpath('.//tr')[1:]
-        for tr in trs:
-            td = tr.xpath('./td')
-            code = td[3].xpath('./a')[0].text
-            name = td[2].xpath('./a')[0].text
-            english_name = td[1].xpath('./a')[0].text
-            articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
-
-            # exclude language variants and languages with few articles
-            if code not in languages and articles >= 10000 and valid_code(code):
-                languages[code] = (name, '', english_name)
-
-
-# Get language names from Google.
-def get_google_languages():
-    response = get(google_languages_url)
-    dom = fromstring(response.text)
-    options = dom.xpath('//select[@name="hl"]/option')
-    for option in options:
-        code = option.xpath('./@value')[0].split('-')[0]
-        name = option.text[:-1].title()
-
-        if code not in languages and valid_code(code):
-            languages[code] = (name, '', '')
-
-
-# Join all language lists.
-# iterate all languages supported by each engine
-def join_language_lists():
-    for engine_name in engines:
-        for locale in engines[engine_name].supported_languages:
-            locale = locale.replace('_', '-')
-            if locale not in languages and valid_code(locale):
-                # try to get language name
-                language = languages.get(locale.split('-')[0], None)
-                if language == None:
-                    print engine_name + ": " + locale
-                    continue
-
-                country = get_country_name(locale)
-                languages[locale] = (language[0], country, language[2])
-
-
-# Remove countryless language if language is featured in only one country.
-def filter_single_country_languages():
-    prev_lang = None
-    for code in sorted(languages):
-        lang = code.split('-')[0]
-        if lang == prev_lang:
-            countries += 1
-        else:
-            if prev_lang is not None and countries == 1:
-                del languages[prev_lang]
-            countries = 0
-        prev_lang = lang
-
-
-# Write languages.py.
-def write_languages_file():
-    new_file = open('languages.py', 'w')
-    file_content = '# -*- coding: utf-8 -*-\n'
-    file_content += '# list of language codes\n'
-    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
-    file_content += '\nlanguage_codes = ('
-    for code in sorted(languages):
-        (name, country, english) = languages[code]
-        file_content += '\n    (u"' + code + '"'\
-                        + ', u"' + name + '"'\
-                        + ', u"' + country + '"'\
-                        + ', u"' + english + '"),'
-    # remove last comma
-    file_content = file_content[:-1]
-    file_content += '\n)\n'
-    new_file.write(file_content.encode('utf8'))
-    new_file.close()
-
-
-if __name__ == "__main__":
-    get_wikipedia_languages()
-    get_google_languages()
-    join_language_lists()
-    filter_single_country_languages()
-    write_languages_file()