Merge pull request #1866 from return42/fix-news

bugfix: google-news and bing-news have changed the language parameter
This commit is contained in:
Markus Heiser 2020-03-04 11:00:30 +00:00 committed by GitHub
commit a5d3585a0c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 28391 additions and 27527 deletions

View file

@ -27,6 +27,7 @@ help:
@echo ' uninstall - uninstall (./local)' @echo ' uninstall - uninstall (./local)'
@echo ' gh-pages - build docs & deploy on gh-pages branch' @echo ' gh-pages - build docs & deploy on gh-pages branch'
@echo ' clean - drop builds and environments' @echo ' clean - drop builds and environments'
@echo ' project - re-build generic files of the searx project'
@echo '' @echo ''
@$(MAKE) -s -f utils/makefile.include make-help @$(MAKE) -s -f utils/makefile.include make-help
@echo '' @echo ''
@ -67,6 +68,18 @@ docs-live: pyenvinstall sphinx-live
$(GH_PAGES):: $(GH_PAGES)::
@echo "doc available at --> $(DOCS_URL)" @echo "doc available at --> $(DOCS_URL)"
# update project files
# --------------------
PHONY += project engines-languages
project: searx/data/engines_languages.json
searx/data/engines_languages.json: pyenvinstall
$(PY_ENV_ACT); python utils/fetch_languages.py
mv engines_languages.json searx/data/engines_languages.json
mv languages.py searx/languages.py
# test # test
# ---- # ----

View file

@ -5,6 +5,7 @@ mock==2.0.0
nose2[coverage_plugin] nose2[coverage_plugin]
cov-core==1.15.0 cov-core==1.15.0
pep8==1.7.0 pep8==1.7.0
pylint
plone.testing==5.0.0 plone.testing==5.0.0
splinter==0.11.0 splinter==0.11.0
transifex-client==0.12.2 transifex-client==0.12.2

File diff suppressed because it is too large Load diff

View file

@ -110,13 +110,18 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = [] lang_tags = set()
dom = html.fromstring(resp.text)
options = eval_xpath(dom, '//div[@id="limit-languages"]//input')
for option in options:
code = eval_xpath(option, './@id')[0].replace('_', '-')
if code == 'nb':
code = 'no'
supported_languages.append(code)
return supported_languages setmkt = re.compile('setmkt=([^&]*)')
dom = html.fromstring(resp.text)
lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]")
for a in lang_links:
href = eval_xpath(a, './@href')[0]
match = setmkt.search(href)
l_tag = match.groups()[0]
_lang, _nation = l_tag.split('-', 1)
l_tag = _lang.lower() + '-' + _nation.upper()
lang_tags.add(l_tag)
return list(lang_tags)

View file

@ -18,6 +18,8 @@ import re
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import match_language from searx.utils import match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
# engine dependent config # engine dependent config
categories = ['images'] categories = ['images']
paging = True paging = True
@ -103,22 +105,3 @@ def response(resp):
continue continue
return results return results
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = []
dom = html.fromstring(resp.text)
regions_xpath = '//div[@id="region-section-content"]' \
+ '//ul[@class="b_vList"]/li/a/@href'
regions = dom.xpath(regions_xpath)
for region in regions:
code = re.search('setmkt=[^\&]+', region).group()[7:]
if code == 'nb-NO':
code = 'no-NO'
supported_languages.append(code)
return supported_languages

View file

@ -15,9 +15,10 @@ from datetime import datetime
from dateutil import parser from dateutil import parser
from lxml import etree from lxml import etree
from searx.utils import list_get, match_language from searx.utils import list_get, match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.url_utils import urlencode, urlparse, parse_qsl from searx.url_utils import urlencode, urlparse, parse_qsl
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
# engine dependent config # engine dependent config
categories = ['news'] categories = ['news']
paging = True paging = True
@ -58,6 +59,7 @@ def _get_url(query, language, offset, time_range):
offset=offset, offset=offset,
interval=time_range_dict[time_range]) interval=time_range_dict[time_range])
else: else:
# e.g. setmkt=de-de&setlang=de
search_path = search_string.format( search_path = search_string.format(
query=urlencode({'q': query, 'setmkt': language}), query=urlencode({'q': query, 'setmkt': language}),
offset=offset) offset=offset)

View file

@ -12,10 +12,10 @@
from json import loads from json import loads
from lxml import html from lxml import html
from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode from searx.url_utils import urlencode
from searx.utils import match_language from searx.utils import match_language
from searx.engines.bing import _fetch_supported_languages, supported_languages_url, language_aliases
categories = ['videos'] categories = ['videos']
paging = True paging = True
@ -67,6 +67,10 @@ def request(query, params):
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
# bing videos did not like "older" versions < 70.0.1 when selecting other
# languages than 'en' .. very strange ?!?!
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
return params return params

View file

@ -54,7 +54,7 @@ def request(query, params):
if params['language'] != 'all': if params['language'] != 'all':
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
if language: if language:
params['url'] += '&lr=lang_' + language params['url'] += '&hl=' + language
return params return params

View file

@ -3,9 +3,11 @@
# this file is generated automatically by utils/update_search_languages.py # this file is generated automatically by utils/update_search_languages.py
language_codes = ( language_codes = (
(u"af-NA", u"Afrikaans", u"", u"Afrikaans"),
(u"ar-SA", u"العربية", u"", u"Arabic"), (u"ar-SA", u"العربية", u"", u"Arabic"),
(u"be-BY", u"Беларуская", u"", u"Belarusian"),
(u"bg-BG", u"Български", u"", u"Bulgarian"), (u"bg-BG", u"Български", u"", u"Bulgarian"),
(u"ca-ES", u"Català", u"", u"Catalan"), (u"ca-AD", u"Català", u"", u"Catalan"),
(u"cs-CZ", u"Čeština", u"", u"Czech"), (u"cs-CZ", u"Čeština", u"", u"Czech"),
(u"da-DK", u"Dansk", u"", u"Danish"), (u"da-DK", u"Dansk", u"", u"Danish"),
(u"de", u"Deutsch", u"", u"German"), (u"de", u"Deutsch", u"", u"German"),
@ -17,11 +19,15 @@ language_codes = (
(u"en-AU", u"English", u"Australia", u"English"), (u"en-AU", u"English", u"Australia", u"English"),
(u"en-CA", u"English", u"Canada", u"English"), (u"en-CA", u"English", u"Canada", u"English"),
(u"en-GB", u"English", u"United Kingdom", u"English"), (u"en-GB", u"English", u"United Kingdom", u"English"),
(u"en-IE", u"English", u"Ireland", u"English"),
(u"en-IN", u"English", u"India", u"English"), (u"en-IN", u"English", u"India", u"English"),
(u"en-MY", u"English", u"Malaysia", u"English"), (u"en-NZ", u"English", u"New Zealand", u"English"),
(u"en-PH", u"English", u"Philippines", u"English"),
(u"en-SG", u"English", u"Singapore", u"English"),
(u"en-US", u"English", u"United States", u"English"), (u"en-US", u"English", u"United States", u"English"),
(u"es", u"Español", u"", u"Spanish"), (u"es", u"Español", u"", u"Spanish"),
(u"es-AR", u"Español", u"Argentina", u"Spanish"), (u"es-AR", u"Español", u"Argentina", u"Spanish"),
(u"es-CL", u"Español", u"Chile", u"Spanish"),
(u"es-ES", u"Español", u"España", u"Spanish"), (u"es-ES", u"Español", u"España", u"Spanish"),
(u"es-MX", u"Español", u"México", u"Spanish"), (u"es-MX", u"Español", u"México", u"Spanish"),
(u"et-EE", u"Eesti", u"", u"Estonian"), (u"et-EE", u"Eesti", u"", u"Estonian"),
@ -35,6 +41,7 @@ language_codes = (
(u"he-IL", u"עברית", u"", u"Hebrew"), (u"he-IL", u"עברית", u"", u"Hebrew"),
(u"hr-HR", u"Hrvatski", u"", u"Croatian"), (u"hr-HR", u"Hrvatski", u"", u"Croatian"),
(u"hu-HU", u"Magyar", u"", u"Hungarian"), (u"hu-HU", u"Magyar", u"", u"Hungarian"),
(u"hy-AM", u"Հայերեն", u"", u"Armenian"),
(u"id-ID", u"Indonesia", u"", u"Indonesian"), (u"id-ID", u"Indonesia", u"", u"Indonesian"),
(u"is-IS", u"Íslenska", u"", u"Icelandic"), (u"is-IS", u"Íslenska", u"", u"Icelandic"),
(u"it-IT", u"Italiano", u"", u"Italian"), (u"it-IT", u"Italiano", u"", u"Italian"),
@ -42,7 +49,7 @@ language_codes = (
(u"ko-KR", u"한국어", u"", u"Korean"), (u"ko-KR", u"한국어", u"", u"Korean"),
(u"lt-LT", u"Lietuvių", u"", u"Lithuanian"), (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
(u"lv-LV", u"Latviešu", u"", u"Latvian"), (u"lv-LV", u"Latviešu", u"", u"Latvian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"), (u"ms-MY", u"Melayu", u"", u"Malay"),
(u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"), (u"nb-NO", u"Norsk Bokmål", u"", u"Norwegian Bokmål"),
(u"nl", u"Nederlands", u"", u"Dutch"), (u"nl", u"Nederlands", u"", u"Dutch"),
(u"nl-BE", u"Nederlands", u"België", u"Dutch"), (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
@ -55,8 +62,9 @@ language_codes = (
(u"ru-RU", u"Русский", u"", u"Russian"), (u"ru-RU", u"Русский", u"", u"Russian"),
(u"sk-SK", u"Slovenčina", u"", u"Slovak"), (u"sk-SK", u"Slovenčina", u"", u"Slovak"),
(u"sl-SI", u"Slovenščina", u"", u"Slovenian"), (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
(u"sr-RS", u"Српски", u"", u"Serbian"), (u"sr-RS", u"Srpski", u"", u"Serbian"),
(u"sv-SE", u"Svenska", u"", u"Swedish"), (u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"sw-KE", u"Kiswahili", u"", u"Swahili"),
(u"th-TH", u"ไทย", u"", u"Thai"), (u"th-TH", u"ไทย", u"", u"Thai"),
(u"tr-TR", u"Türkçe", u"", u"Turkish"), (u"tr-TR", u"Türkçe", u"", u"Turkish"),
(u"uk-UA", u"Українська", u"", u"Ukrainian"), (u"uk-UA", u"Українська", u"", u"Ukrainian"),

View file

@ -5,7 +5,7 @@
# Output files (engines_languages.json and languages.py) # Output files (engines_languages.json and languages.py)
# are written in current directory to avoid overwriting in case something goes wrong. # are written in current directory to avoid overwriting in case something goes wrong.
from json import dump import json
import io import io
from sys import path from sys import path
from babel import Locale, UnknownLocaleError from babel import Locale, UnknownLocaleError
@ -22,19 +22,22 @@ languages_file = 'languages.py'
# Fetches supported languages for each engine and writes json file with those. # Fetches supported languages for each engine and writes json file with those.
def fetch_supported_languages(): def fetch_supported_languages():
engines_languages = {} engines_languages = {}
for engine_name in engines: names = list(engines)
names.sort()
for engine_name in names:
print("fetching languages of engine %s" % engine_name)
if hasattr(engines[engine_name], 'fetch_supported_languages'): if hasattr(engines[engine_name], 'fetch_supported_languages'):
try:
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
if type(engines_languages[engine_name]) == list: if type(engines_languages[engine_name]) == list:
engines_languages[engine_name] = sorted(engines_languages[engine_name]) engines_languages[engine_name] = sorted(engines_languages[engine_name])
except Exception as e:
print(e)
# write json file # write json file
with io.open(engines_languages_file, "w", encoding="utf-8") as f: with open(engines_languages_file, 'w', encoding='utf-8') as f:
dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': ')) json.dump(engines_languages, f, indent=2, sort_keys=True)
return engines_languages return engines_languages