Merge pull request #2482 from return42/fix-google-video

[fix] revise of the google-Video engine
Alexandre Flament 2021-01-28 11:11:07 +01:00 committed by GitHub
commit 71d66979c2
8 changed files with 1186 additions and 1316 deletions

Makefile

@@ -166,6 +166,18 @@ PHONY += gecko.driver
 gecko.driver:
 	$(PY_ENV_ACT); ./manage.sh install_geckodriver

+# search.checker
+# --------------
+
+search.checker: pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v
+
+ENGINE_TARGETS=$(patsubst searx/engines/%.py,search.checker.%,$(wildcard searx/engines/[!_]*.py))
+
+$(ENGINE_TARGETS): pyenvinstall
+	$(Q)$(PY_ENV_ACT); searx-checker -v "$(subst _, ,$(patsubst search.checker.%,%,$@))"
+
 # test
 # ----
@@ -179,7 +191,9 @@ PYLINT_FILES=\
 	searx/engines/deviantart.py \
 	searx/engines/digg.py \
 	searx/engines/google.py \
-	searx/engines/google_news.py
+	searx/engines/google_news.py \
+	searx/engines/google_videos.py \
+	searx/engines/google_images.py

 test.pylint: pyenvinstall
 	$(call cmd,pylint,$(PYLINT_FILES))

File diff suppressed because it is too large.

searx/engines/google.py

@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)

 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.

 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException

 logger = logger.getChild('google engine')

 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',  # New Zealand
     'PH': 'google.com.ph', # Philippines
     'SG': 'google.com.sg', # Singapore
-    # 'US': 'google.us',   # United States, redirect to .com
+    'US': 'google.com',    # United States (google.us) redirects to .com
     'ZA': 'google.co.za',  # South Africa
     'AR': 'google.com.ar', # Argentina
     'CL': 'google.cl',     # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',  # Thailand
     'TR': 'google.com.tr', # Turkey
     'UA': 'google.com.ua', # Ukraine
-    # 'CN': 'google.cn',   # China, only from China ?
+    'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk', # Hong Kong
     'TW': 'google.com.tw'  # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'

-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
-
-    language_array = language.split('-')
-
-    if len(language_array) == 2:
-        country = language_array[1]
-    else:
-        country = language_array[0].upper()
-
-    language = match_language(language, lang_list, custom_aliases)
-    lang_country = '%s-%s' % (language, country)
-    if lang_country == 'en-EN':
-        lang_country = 'en'
-
-    return language, country, lang_country
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
+
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
+
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
+
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
+    else:
+        country = _l[0].upper()
+        if country == 'EN':
+            country = 'US'
+
+    ret_val['country'] = country
+
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
+    lang_country = '%s-%s' % (language, country)
+
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val

 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""

     offset = (params['pageno'] - 1) * 10

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)

     results = []

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)

@@ -247,7 +271,9 @@ def response(resp):
             logger.debug('ingoring <div class="g" ../> section: missing title')
             continue
         title = extract_text(title_tag)
-        url = eval_xpath_getindex(result, href_xpath, 0)
+        url = eval_xpath_getindex(result, href_xpath, 0, None)
+        if url is None:
+            continue
         content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
         results.append({
             'url': url,
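
The change above replaces the old get_lang_country() tuple with a get_lang_info() dict that bundles the request subdomain, the hl and lr parameters and the Accept-Language header in one place. The following standalone sketch mirrors that logic so the returned shape is easy to see; sketch_lang_info, the tiny google_domains/supported_languages dicts and the de-AT locale are made-up illustrations, not code from the PR:

    # Illustrative sketch only -- names and data below are invented, not searx code.
    google_domains = {'AT': 'google.at', 'US': 'google.com'}      # excerpt-style example
    supported_languages = {'de': 'de', 'de-AT': 'de', 'en': 'en'}  # hypothetical shape

    def sketch_lang_info(lang, lang_list, domains):
        language, _, country = lang.partition('-')
        country = country or language.upper()
        if country == 'EN':
            country = 'US'
        lang_country = '%s-%s' % (language, country)
        hl = lang_list.get(lang_country, language)
        return {
            'language': language,
            'country': country,
            'subdomain': 'www.' + domains.get(country, 'google.com'),
            'hl': hl,             # interface language of the result page
            'lr': 'lang_' + hl,   # restrict results to this language
            'Accept-Language': '%s,%s;q=0.8,en;q=0.6,*;q=0.5' % (lang_country, language),
        }

    print(sketch_lang_info('de-AT', supported_languages, google_domains))
    # -> {'language': 'de', 'country': 'AT', 'subdomain': 'www.google.at',
    #     'hl': 'de', 'lr': 'lang_de', 'Accept-Language': 'de-AT,de;q=0.8,en;q=0.6,*;q=0.5'}

Keeping all request-related language data in one dict is what lets the images, news and video engines below reuse exactly the same headers and URL parameters.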

searx/engines/google_images.py

@@ -10,35 +10,50 @@ Definitions`_.
    ``data:` scheme).::

      Header set Content-Security-Policy "img-src 'self' data: ;"

+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
+
 """

 from urllib.parse import urlencode, unquote
 from lxml import html

 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)

 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     detect_google_sorry,
 )

+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    , _fetch_supported_languages
+)
+# pylint: enable=unused-import

 logger = logger.getChild('google images')

 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }

 # engine dependent config
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "isch",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -125,13 +136,11 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

     # parse results
     #
@@ -156,10 +165,9 @@ def response(resp):
         return results
     root = root[0]

-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

         img_base64_id = eval_xpath(img_node, '@data-iid')
         if img_base64_id:

@@ -174,8 +182,8 @@ def response(resp):
         else:
             thumbnail_src = ''

-        link_node = eval_xpath(img_node, '../../../a[2]')[0]
-        url = eval_xpath(link_node, '@href')[0]
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)

         pub_nodes = eval_xpath(link_node, './div/div')
         pub_descr = img_alt

@@ -184,7 +192,7 @@ def response(resp):
             pub_descr = extract_text(pub_nodes[0])
             pub_source = extract_text(pub_nodes[1])

-        img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
         src_url = scrap_img_by_id(img_src_script, img_src_id)
         if not src_url:
             src_url = thumbnail_src

@@ -199,12 +207,5 @@ def response(resp):
             'thumbnail_src': thumbnail_src,
             'template': 'images.html'
         })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue

     return results
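
Most of the churn in this file replaces indexed eval_xpath(...)[0] lookups and the former broad try/except with eval_xpath_getindex(), which either raises a telling error or returns a caller-supplied default. A rough stand-in for that helper on plain lxml, to show the pattern (not the actual searx implementation):

    from lxml import html

    _RAISE = object()   # sentinel: "no default given, raise instead"

    def xpath_getindex(elem, xpath_spec, index, default=_RAISE):
        """Return item #index of an XPath result, or `default` instead of raising."""
        result = elem.xpath(xpath_spec)
        if index < len(result):
            return result[index]
        if default is _RAISE:
            raise ValueError('XPath %r has no result #%d' % (xpath_spec, index))
        return default

    doc = html.fromstring('<div class="g"><a href="https://example.org">hit</a></div>')
    print(xpath_getindex(doc, './/a/@href', 0))               # 'https://example.org'
    print(xpath_getindex(doc, './/img/@data-iid', 0, None))   # None

With a default of None, a missing node becomes an ordinary skip in the result loop instead of a logged exception.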

searx/engines/google_news.py

@@ -2,13 +2,16 @@
 """Google (News)

 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_. Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_. Not all parameters can be appied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*

 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp

 """
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import

 from searx.engines.google import (
-    get_lang_country,
-    filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )

 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identitical for safesearch=0 and safesearch=2
+safesearch = False

 def request(query, params):
     """Google-News search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'

-    if params['time_range']: # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]

-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
-
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )

-    # hl=en redirect to hl=en-US / en-CA ...
-    params['soft_max_redirects'] = 1
-
-    #params['google_subdomain'] = subdomain

     return params
@@ -123,9 +127,6 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
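
The hand-appended ceid parameter above exists because urlencode() would percent-encode the colon in values such as US:en, which the comment in the diff says must stay unencoded. A quick illustration (the country/language pair is just an example):

    from urllib.parse import urlencode

    ceid = '%s:%s' % ('US', 'en')    # example country:language pair

    # urlencode() escapes the colon ...
    print(urlencode({'q': 'android', 'ceid': ceid}))   # q=android&ceid=US%3Aen

    # ... so the engine appends ceid verbatim after the encoded part:
    query_url = 'https://news.google.com/search?' + urlencode({'q': 'android'}) + '&ceid=' + ceid
    print(query_url)                                    # https://news.google.com/search?q=android&ceid=US:en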

searx/engines/google_videos.py

@@ -1,99 +1,202 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Google (Videos)
+"""Google (Video)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_. Not all parameters can be appied.
+
+.. _admonition:: Content-Security-Policy (CSP)
+
+   This engine needs to allow images from the `data URLs`_ (prefixed with the
+   ``data:` scheme).::
+
+     Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """

-from datetime import date, timedelta
+# pylint: disable=invalid-name, missing-function-docstring
+
+import re
 from urllib.parse import urlencode
 from lxml import html
-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
-import re
+
+from searx import logger
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.google import (
+    get_lang_info,
+    time_range_dict,
+    filter_mapping,
+    results_xpath,
+    g_section_with_header,
+    title_xpath,
+    href_xpath,
+    content_xpath,
+    suggestion_xpath,
+    spelling_suggestion_xpath,
+    detect_google_sorry,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    , _fetch_supported_languages
+)
+# pylint: enable=unused-import

 # about
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }

+logger = logger.getChild('google video')
+
 # engine dependent config
 categories = ['videos']
-paging = True
-safesearch = True
+paging = False
+language_support = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
+safesearch = True

-search_url = 'https://www.google.com/search'\
-    '?q={query}'\
-    '&tbm=vid'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm'}
+RE_CACHE = {}
+
+def _re(regexpr):
+    """returns compiled regular expression"""
+    RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
+    return RE_CACHE[regexpr]
+
+def scrap_out_thumbs(dom):
+    """Scrap out thumbnail data from <script> tags.
+    """
+    ret_val = dict()
+    thumb_name = 'vidthumb'
+
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
+        _script = script.text
+
+        # var s='data:image/jpeg;base64, ...'
+        _imgdata = _re("s='([^']*)").findall( _script)
+        if not _imgdata:
+            continue
+
+        # var ii=['vidthumb4','vidthumb7']
+        for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
+            # At least the equal sign in the URL needs to be decoded
+            ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
+
+    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
+            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
+            if match:
+                # At least the equal sign in the URL needs to be decoded
+                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
+
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val

-# do search-request
 def request(query, params):
-    search_options = {
-        'ijn': params['pageno'] - 1,
-        'start': (params['pageno'] - 1) * number_of_results
-    }
+    """Google-Video search request"""
+
+    lang_info = get_lang_info(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
+        'q': query,
+        'tbm': "vid",
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
+        'ie': "utf8",
+        'oe': "utf8",
+    })

     if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
-    elif params['time_range'] == 'year':
-        now = date.today()
-        then = now - timedelta(days=365)
-        start = then.strftime('%m/%d/%Y')
-        end = now.strftime('%m/%d/%Y')
-        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)
-
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
-
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
+        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+
+    logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
+
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    )

     return params

-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []

+    detect_google_sorry(resp)
+
+    # convert the text to dom
     dom = html.fromstring(resp.text)
+
+    vidthumb_imgdata = scrap_out_thumbs(dom)

     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="g"]'):
-
-        title = extract_text(eval_xpath(result, './/h3'))
-        url = eval_xpath_getindex(result, './/div[@class="r"]/a/@href', 0)
-        content = extract_text(eval_xpath(result, './/span[@class="st"]'))
-
-        # get thumbnails
-        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
-        ids = result.xpath('.//div[@class="s"]//img/@id')
-        if len(ids) > 0:
-            thumbnails_data = \
-                re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
-                           script)
-            tmp = []
-            if len(thumbnails_data) != 0:
-                tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
-            thumbnail = ''
-            if len(tmp) != 0:
-                thumbnail = tmp[-1]
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'thumbnail': thumbnail,
-                        'template': 'videos.html'})
+    for result in eval_xpath_list(dom, results_xpath):
+
+        # google *sections*
+        if extract_text(eval_xpath(result, g_section_with_header)):
+            logger.debug("ingoring <g-section-with-header>")
+            continue
+
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)
+
+        # <img id="vidthumb1" ...>
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
+            continue
+        img_src = vidthumb_imgdata.get(img_id, None)
+        if not img_src:
+            logger.error("no vidthumb imgdata for: %s" % img_id)
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+
+        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
+        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
+        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+
+        results.append({
+            'url': url,
+            'title': title,
+            'content': content,
+            'length': length,
+            'author': pub_info,
+            'thumbnail': img_src,
+            'template': 'videos.html',
+        })
+
+    # parse suggestion
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
+        # append suggestion
+        results.append({'suggestion': extract_text(suggestion)})
+
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
+        results.append({'correction': extract_text(correction)})

     return results
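
Instead of re-scanning the page with one large regular expression per result, the rewritten engine collects every vidthumb<N> data-URL from the inline _setImagesSrc scripts once (scrap_out_thumbs()) and then looks thumbnails up by the id of each result's <img> tag. A toy reproduction of that first extraction step; the embedded script snippet is fabricated for the example and far shorter than real Google markup:

    import re
    from lxml import html

    # Fabricated fragment that mimics the inline script the engine parses.
    page = html.fromstring("""
    <html><body><script>
      function _setImagesSrc(){var s='data:image/jpeg;base64,AAAA';
      var ii=['vidthumb4','vidthumb7'];}
    </script></body></html>
    """)

    thumbs = {}
    for script in page.xpath('//script[contains(., "_setImagesSrc")]'):
        text = script.text
        imgdata = re.findall(r"s='([^']*)", text)          # the base64 data: URL
        if not imgdata:
            continue
        for vidthumb in re.findall(r"vidthumb\d+", text):   # ids that reference it
            # decode any escaped '=' padding, as the engine does
            thumbs[vidthumb] = imgdata[0].replace(r"\x3d", "=")

    print(thumbs)
    # {'vidthumb4': 'data:image/jpeg;base64,AAAA', 'vidthumb7': 'data:image/jpeg;base64,AAAA'}

The real scrap_out_thumbs() additionally caches its compiled expressions in RE_CACHE and handles a second "google.ldi={...}" script variant.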

searx/languages.py

@@ -21,8 +21,6 @@ language_codes = \
     ('en-IE', 'English', 'Ireland', 'English'),
     ('en-IN', 'English', 'India', 'English'),
     ('en-NZ', 'English', 'New Zealand', 'English'),
-    ('en-PH', 'English', 'Philippines', 'English'),
-    ('en-SG', 'English', 'Singapore', 'English'),
     ('en-US', 'English', 'United States', 'English'),
     ('es', 'Español', '', 'Spanish'),
     ('es-AR', 'Español', 'Argentina', 'Spanish'),
@@ -48,7 +46,6 @@ language_codes = \
     ('ko-KR', '한국어', '', 'Korean'),
     ('lt-LT', 'Lietuvių', '', 'Lithuanian'),
     ('lv-LV', 'Latviešu', '', 'Latvian'),
-    ('ms-MY', 'Melayu', '', 'Malay'),
     ('nb-NO', 'Norsk Bokmål', '', 'Norwegian Bokmål'),
     ('nl', 'Nederlands', '', 'Dutch'),
     ('nl-BE', 'Nederlands', 'België', 'Dutch'),

searx/settings.yml

@@ -117,6 +117,7 @@ checker:
     # every: [86400, 90000]  # how often the checker runs
     # additional tests: only for the YAML anchors (see the engines section)

     additional_tests:
       rosebud: &test_rosebud
         matrix:
@@ -127,6 +128,17 @@ checker:
           - ['one_title_contains', 'citizen kane']
         test:
           - unique_results
+
+      android: &test_android
+        matrix:
+          query: ['android']
+          lang: ['en', 'de', 'fr', 'zh-CN']
+        result_container:
+          - not_empty
+          - ['one_title_contains', 'google']
+        test:
+          - unique_results

     # tests: only for the YAML anchors (see the engines section)
     tests:
       infobox: &tests_infobox
@@ -480,18 +492,32 @@ engines:
   - name : google
     engine : google
    shortcut : go
+    # additional_tests:
+    #   android: *test_android

   - name : google images
     engine : google_images
     shortcut : goi
+    # additional_tests:
+    #   android: *test_android
+    #   dali:
+    #     matrix:
+    #       query: ['Dali Christ']
+    #       lang: ['en', 'de', 'fr', 'zh-CN']
+    #     result_container:
+    #       - ['one_title_contains', 'Salvador']

   - name : google news
     engine : google_news
     shortcut : gon
+    # additional_tests:
+    #   android: *test_android

   - name : google videos
     engine : google_videos
     shortcut : gov
+    # additional_tests:
+    #   android: *test_android

   - name : google scholar
     engine : xpath