Merge pull request #2482 from return42/fix-google-video

[fix] revise of the google-Video engine
Alexandre Flament 2021-01-28 11:11:07 +01:00 committed by GitHub
commit 71d66979c2
8 changed files with 1186 additions and 1316 deletions

Makefile

@@ -166,6 +166,18 @@ PHONY += gecko.driver
 gecko.driver:
 	$(PY_ENV_ACT); ./manage.sh install_geckodriver

+# search.checker
+# --------------
+
+search.checker: pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v
+
+ENGINE_TARGETS=$(patsubst searx/engines/%.py,search.checker.%,$(wildcard searx/engines/[!_]*.py))
+
+$(ENGINE_TARGETS): pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v "$(subst _, ,$(patsubst search.checker.%,%,$@))"
+
 # test
 # ----
@@ -179,7 +191,9 @@ PYLINT_FILES=\
 	searx/engines/deviantart.py \
 	searx/engines/digg.py \
 	searx/engines/google.py \
-	searx/engines/google_news.py
+	searx/engines/google_news.py \
+	searx/engines/google_videos.py \
+	searx/engines/google_images.py

 test.pylint: pyenvinstall
 	$(call cmd,pylint,$(PYLINT_FILES))

File diff suppressed because it is too large.

searx/engines/google.py

@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)

 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.

 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException

 logger = logger.getChild('google engine')

 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',  # New Zealand
     'PH': 'google.com.ph', # Philippines
     'SG': 'google.com.sg', # Singapore
-    # 'US': 'google.us',   # United States, redirect to .com
+    'US': 'google.com',    # United States (google.us) redirects to .com
     'ZA': 'google.co.za',  # South Africa
     'AR': 'google.com.ar', # Argentina
     'CL': 'google.cl',     # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',  # Thailand
     'TR': 'google.com.tr', # Turkey
     'UA': 'google.com.ua', # Ukraine
-    # 'CN': 'google.cn',   # China, only from China ?
+    'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk', # Hong Kong
     'TW': 'google.com.tw'  # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'

-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
-
-    language_array = language.split('-')
-
-    if len(language_array) == 2:
-        country = language_array[1]
-    else:
-        country = language_array[0].upper()
-
-    language = match_language(language, lang_list, custom_aliases)
-    lang_country = '%s-%s' % (language, country)
-    if lang_country == 'en-EN':
-        lang_country = 'en'
-
-    return language, country, lang_country
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
+
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
+
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
+
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
+    else:
+        country = _l[0].upper()
+        if country == 'EN':
+            country = 'US'
+
+    ret_val['country'] = country
+
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
+    lang_country = '%s-%s' % (language, country)
+
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val

 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""

     offset = (params['pageno'] - 1) * 10

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)

     results = []

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)

@@ -247,7 +271,9 @@ def response(resp):
             logger.debug('ingoring <div class="g" ../> section: missing title')
             continue
         title = extract_text(title_tag)
-        url = eval_xpath_getindex(result, href_xpath, 0)
+        url = eval_xpath_getindex(result, href_xpath, 0, None)
+        if url is None:
+            continue
         content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
         results.append({
             'url': url,
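
The change above replaces the old get_lang_country() tuple with a get_lang_info() dict that bundles the request subdomain, the hl and lr parameters and the Accept-Language header in one place. The following standalone sketch mirrors that logic so the returned shape is easy to see; sketch_lang_info, the tiny google_domains/supported_languages dicts and the de-AT locale are made-up illustrations, not code from the PR:

    # Illustrative sketch only -- names and data below are invented, not searx code.
    google_domains = {'AT': 'google.at', 'US': 'google.com'}      # excerpt-style example
    supported_languages = {'de': 'de', 'de-AT': 'de', 'en': 'en'}  # hypothetical shape

    def sketch_lang_info(lang, lang_list, domains):
        language, _, country = lang.partition('-')
        country = country or language.upper()
        if country == 'EN':
            country = 'US'
        lang_country = '%s-%s' % (language, country)
        hl = lang_list.get(lang_country, language)
        return {
            'language': language,
            'country': country,
            'subdomain': 'www.' + domains.get(country, 'google.com'),
            'hl': hl,             # interface language of the result page
            'lr': 'lang_' + hl,   # restrict results to this language
            'Accept-Language': '%s,%s;q=0.8,en;q=0.6,*;q=0.5' % (lang_country, language),
        }

    print(sketch_lang_info('de-AT', supported_languages, google_domains))
    # -> {'language': 'de', 'country': 'AT', 'subdomain': 'www.google.at',
    #     'hl': 'de', 'lr': 'lang_de', 'Accept-Language': 'de-AT,de;q=0.8,en;q=0.6,*;q=0.5'}

Keeping all request-related language data in one dict is what lets the images, news and video engines below reuse exactly the same headers and URL parameters.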

searx/engines/google_images.py

@@ -10,35 +10,50 @@ Definitions`_.
    ``data:` scheme).::

      Header set Content-Security-Policy "img-src 'self' data: ;"

+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+
 """

 from urllib.parse import urlencode, unquote
 from lxml import html

 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)

 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     detect_google_sorry,
 )

+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    , _fetch_supported_languages
+)
+# pylint: enable=unused-import

 logger = logger.getChild('google images')

 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }

 # engine dependent config
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "isch",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -125,13 +136,11 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

     # parse results
     #
@@ -156,10 +165,9 @@ def response(resp):
         return results
     root = root[0]

-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

         img_base64_id = eval_xpath(img_node, '@data-iid')
         if img_base64_id:

@@ -174,8 +182,8 @@ def response(resp):
         else:
             thumbnail_src = ''

-        link_node = eval_xpath(img_node, '../../../a[2]')[0]
-        url = eval_xpath(link_node, '@href')[0]
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)

         pub_nodes = eval_xpath(link_node, './div/div')
         pub_descr = img_alt

@@ -184,7 +192,7 @@ def response(resp):
             pub_descr = extract_text(pub_nodes[0])
             pub_source = extract_text(pub_nodes[1])

-        img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
         src_url = scrap_img_by_id(img_src_script, img_src_id)
         if not src_url:
             src_url = thumbnail_src

@@ -199,12 +207,5 @@ def response(resp):
             'thumbnail_src': thumbnail_src,
             'template': 'images.html'
         })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue

     return results
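
Most of the churn in this file replaces indexed eval_xpath(...)[0] lookups and the former broad try/except with eval_xpath_getindex(), which either raises a telling error or returns a caller-supplied default. A rough stand-in for that helper on plain lxml, to show the pattern (not the actual searx implementation):

    from lxml import html

    _RAISE = object()   # sentinel: "no default given, raise instead"

    def xpath_getindex(elem, xpath_spec, index, default=_RAISE):
        """Return item #index of an XPath result, or `default` instead of raising."""
        result = elem.xpath(xpath_spec)
        if index < len(result):
            return result[index]
        if default is _RAISE:
            raise ValueError('XPath %r has no result #%d' % (xpath_spec, index))
        return default

    doc = html.fromstring('<div class="g"><a href="https://example.org">hit</a></div>')
    print(xpath_getindex(doc, './/a/@href', 0))               # 'https://example.org'
    print(xpath_getindex(doc, './/img/@data-iid', 0, None))   # None

With a default of None, a missing node becomes an ordinary skip in the result loop instead of a logged exception.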

searx/engines/google_news.py

@@ -2,13 +2,16 @@
 """Google (News)

 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_. Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_. Not all parameters can be appied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*

 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp

 """
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import

 from searx.engines.google import (
-    get_lang_country,
-    filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )

 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identitical for safesearch=0 and safesearch=2
+safesearch = False

 def request(query, params):
     """Google-News search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'

-    if params['time_range']: # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]

-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
-
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )

-    # hl=en redirect to hl=en-US / en-CA ...
-    params['soft_max_redirects'] = 1
-
-    #params['google_subdomain'] = subdomain

     return params
@@ -123,9 +127,6 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
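
The hand-appended ceid parameter above exists because urlencode() would percent-encode the colon in values such as US:en, which the comment in the diff says must stay unencoded. A quick illustration (the country/language pair is just an example):

    from urllib.parse import urlencode

    ceid = '%s:%s' % ('US', 'en')    # example country:language pair

    # urlencode() escapes the colon ...
    print(urlencode({'q': 'android', 'ceid': ceid}))   # q=android&ceid=US%3Aen

    # ... so the engine appends ceid verbatim after the encoded part:
    query_url = 'https://news.google.com/search?' + urlencode({'q': 'android'}) + '&ceid=' + ceid
    print(query_url)                                    # https://news.google.com/search?q=android&ceid=US:en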

searx/engines/google_videos.py

@@ -1,99 +1,202 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Google (Videos)
+"""Google (Video)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_. Not all parameters can be appied.
+
+.. _admonition:: Content-Security-Policy (CSP)
+
+   This engine needs to allow images from the `data URLs`_ (prefixed with the
+   ``data:` scheme).::
+
+     Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """

-from datetime import date, timedelta
+# pylint: disable=invalid-name, missing-function-docstring
+
+import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
-import re
+
+from searx import logger
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.google import (
+    get_lang_info,
+    time_range_dict,
+    filter_mapping,
+    results_xpath,
+    g_section_with_header,
+    title_xpath,
+    href_xpath,
+    content_xpath,
+    suggestion_xpath,
+    spelling_suggestion_xpath,
+    detect_google_sorry,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    , _fetch_supported_languages
+)
+# pylint: enable=unused-import

 # about
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }

+logger = logger.getChild('google video')
+
 # engine dependent config
 categories = ['videos']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
+safesearch = True

-search_url = 'https://www.google.com/search'\
-    '?q={query}'\
-    '&tbm=vid'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm'}
+RE_CACHE = {}
+
+def _re(regexpr):
+    """returns compiled regular expression"""
+    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
+    return RE_CACHE[regexpr]
+
+def scrap_out_thumbs(dom):
+    """Scrap out thumbnail data from <script> tags.
+    """
+    ret_val = dict()
+    thumb_name = 'vidthumb'
+
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
+        _script = script.text
+
+        # var s='data:image/jpeg;base64, ...'
+        _imgdata = _re("s='([^']*)").findall( _script)
+        if not _imgdata:
+            continue
+
+        # var ii=['vidthumb4','vidthumb7']
+        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
+            # At least the equal sign in the URL needs to be decoded
+            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
+
+    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
+            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
+            if match:
+                # At least the equal sign in the URL needs to be decoded
+                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
+
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val

-# do search-request
 def request(query, params):
-    search_options = {
-        'ijn': params['pageno'] - 1,
-        'start': (params['pageno'] - 1) * number_of_results
-    }
+    """Google-Video search request"""
+
+    lang_info = get_lang_info(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
+        'q': query,
+        'tbm': "vid",
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
+        'ie': "utf8",
+        'oe': "utf8",
+    })

     if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
-    elif params['time_range'] == 'year':
-        now = date.today()
-        then = now - timedelta(days=365)
-        start = then.strftime('%m/%d/%Y')
-        end = now.strftime('%m/%d/%Y')
-        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
-
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
-
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
+        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+    logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
+
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    )

     return params

-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []

+    detect_google_sorry(resp)
+
+    # convert the text to dom
     dom = html.fromstring(resp.text)
+
+    vidthumb_imgdata = scrap_out_thumbs(dom)

     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="g"]'):
-
-        title = extract_text(eval_xpath(result, './/h3'))
-        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
-        content = extract_text(eval_xpath(result, './/span[@class="st"]'))
-
-        # get thumbnails
-        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
-        ids = result.xpath('.//div[@class="s"]//img/@id')
-        if len(ids) > 0:
-            thumbnails_data = \
-                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
-                           script)
-            tmp = []
-            if len(thumbnails_data) != 0:
-                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
-            thumbnail = ''
-            if len(tmp) != 0:
-                thumbnail = tmp[-1]
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'thumbnail': thumbnail,
-                        'template': 'videos.html'})
+    for result in eval_xpath_list(dom, results_xpath):
+
+        # google *sections*
+        if extract_text(eval_xpath(result, g_section_with_header)):
+            logger.debug("ingoring <g-section-with-header>")
+            continue
+
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)
+
+        # <img id="vidthumb1" ...>
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
+            continue
+        img_src = vidthumb_imgdata.get(img_id, None)
+        if not img_src:
+            logger.error("no vidthumb imgdata for: %s" % img_id)
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+
+        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
+        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
+        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+
+        results.append({
+            'url': url,
+            'title': title,
+            'content': content,
+            'length': length,
+            'author': pub_info,
+            'thumbnail': img_src,
+            'template': 'videos.html',
+        })
+
+    # parse suggestion
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
+
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
+        results.append({'correction': extract_text(correction)})

     return results
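
Instead of re-scanning the page with one large regular expression per result, the rewritten engine collects every vidthumb<N> data-URL from the inline _setImagesSrc scripts once (scrap_out_thumbs()) and then looks thumbnails up by the id of each result's <img> tag. A toy reproduction of that first extraction step; the embedded script snippet is fabricated for the example and far shorter than real Google markup:

    import re
    from lxml import html

    # Fabricated fragment that mimics the inline script the engine parses.
    page = html.fromstring("""
    <html><body><script>
      function _setImagesSrc(){var s='data:image/jpeg;base64,AAAA';
      var ii=['vidthumb4','vidthumb7'];}
    </script></body></html>
    """)

    thumbs = {}
    for script in page.xpath('//script[contains(., "_setImagesSrc")]'):
        text = script.text
        imgdata = re.findall(r"s='([^']*)", text)          # the base64 data: URL
        if not imgdata:
            continue
        for vidthumb in re.findall(r"vidthumb\d+", text):   # ids that reference it
            # decode any escaped '=' padding, as the engine does
            thumbs[vidthumb] = imgdata[0].replace(r"\x3d", "=")

    print(thumbs)
    # {'vidthumb4': 'data:image/jpeg;base64,AAAA', 'vidthumb7': 'data:image/jpeg;base64,AAAA'}

The real scrap_out_thumbs() additionally caches its compiled expressions in RE_CACHE and handles a second "google.ldi={...}" script variant.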

searx/languages.py

@@ -21,8 +21,6 @@ language_codes = \
     ('en-IE', 'English', 'Ireland', 'English'),
     ('en-IN', 'English', 'India', 'English'),
     ('en-NZ', 'English', 'New Zealand', 'English'),
-    ('en-PH', 'English', 'Philippines', 'English'),
-    ('en-SG', 'English', 'Singapore', 'English'),
     ('en-US', 'English', 'United States', 'English'),
     ('es', 'Español', '', 'Spanish'),
     ('es-AR', 'Español', 'Argentina', 'Spanish'),
@@ -48,7 +46,6 @@ language_codes = \
     ('ko-KR', '한국어', '', 'Korean'),
     ('lt-LT', 'Lietuvių', '', 'Lithuanian'),
     ('lv-LV', 'Latviešu', '', 'Latvian'),
-    ('ms-MY', 'Melayu', '', 'Malay'),
     ('nb-NO', 'Norsk Bokmål', '', 'Norwegian Bokmål'),
     ('nl', 'Nederlands', '', 'Dutch'),
     ('nl-BE', 'Nederlands', 'België', 'Dutch'),

searx/settings.yml

@@ -117,6 +117,7 @@ checker:
     # every: [86400, 90000]  # how often the checker runs
     # additional tests: only for the YAML anchors (see the engines section)

     additional_tests:
       rosebud: &test_rosebud
         matrix:
@@ -127,6 +128,17 @@ checker:
           - ['one_title_contains', 'citizen kane']
         test:
           - unique_results
+
+      android: &test_android
+        matrix:
+          query: ['android']
+          lang: ['en', 'de', 'fr', 'zh-CN']
+        result_container:
+          - not_empty
+          - ['one_title_contains', 'google']
+        test:
+          - unique_results

     # tests: only for the YAML anchors (see the engines section)
     tests:
       infobox: &tests_infobox
@@ -480,18 +492,32 @@ engines:
   - name : google
     engine : google
    shortcut : go
+    # additional_tests:
+    #   android: *test_android

   - name : google images
     engine : google_images
     shortcut : goi
+    # additional_tests:
+    #   android: *test_android
+    #   dali:
+    #     matrix:
+    #       query: ['Dali Christ']
+    #       lang: ['en', 'de', 'fr', 'zh-CN']
+    #     result_container:
+    #       - ['one_title_contains', 'Salvador']

   - name : google news
     engine : google_news
     shortcut : gon
+    # additional_tests:
+    #   android: *test_android

   - name : google videos
     engine : google_videos
     shortcut : gov
+    # additional_tests:
+    #   android: *test_android

   - name : google scholar
     engine : xpath