Ponysearch/searx/engines/google_images.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google images engine using the google
internal API used the Google Go Android app.

This internal API offer results in

- JSON (_fmt:json)
- Protobuf (_fmt:pb)
- Protobuf compressed? (_fmt:pc)
- HTML (_fmt:html)
- Protobuf encoded in JSON (_fmt:jspb).

"""

from urllib.parse import urlencode
from json import loads

from searx.engines.google import (
    get_lang_info,
    time_range_dict,
    detect_google_sorry,
)

# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits

# pylint: enable=unused-import

# about: engine metadata (service website, Wikidata entry, API usage and the
# format of the results this engine parses)
about = dict(
    website='https://images.google.com',
    wikidata_id='Q521550',
    official_api_documentation='https://developers.google.com/custom-search',
    use_official_api=False,
    require_api_key=False,
    results='JSON',
)

# engine dependent config
categories = ['images', 'web']

# Options whose values request() reads from ``params``: the page number
# (``pageno``), the time range (``time_range``) and the safe-search level
# (``safesearch``).
paging = True
time_range_support = True
safesearch = True

# NOTE(review): not referenced in this module; presumably evaluated by the
# engine framework / the shared google helpers -- confirm.
use_locale_domain = True
send_accept_language_header = True

# Value of the ``safe`` URL argument per safe-search level.  Level 0 is falsy,
# so request() never sends its entry.
filter_mapping = {
    0: 'images',
    1: 'active',
    2: 'active',
}


def request(query, params):
    """Assemble URL and HTTP headers of a Google-Image search request.

    The URL points to ``/search`` on the subdomain returned by
    ``get_lang_info`` and selects the JSON output of the internal image API
    (``async=_fmt:json,...``); ``params['pageno']`` is sent as ``ijn``.  The
    time range (``tbs=qdr:...``) and safe-search (``safe=...``) arguments are
    appended only when they are set.

    ``params`` is updated in place (``url`` and ``headers``) and returned.
    """
    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
    endpoint = 'https://' + lang_info['subdomain'] + '/search'

    # Insertion order defines the order in the query string: the search terms
    # first, then the language arguments, then encoding and API selectors.
    args = {'q': query, 'tbm': "isch"}
    args.update(lang_info['params'])
    args.update(
        {
            'ie': "utf8",
            'oe': "utf8",
            'asearch': 'isch',
            'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
        }
    )
    pieces = [urlencode(args)]

    # optional filters, each one encoded as its own query-string fragment
    if params['time_range'] in time_range_dict:
        pieces.append(urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}))
    safesearch_level = params['safesearch']
    if safesearch_level:
        pieces.append(urlencode({'safe': filter_mapping[safesearch_level]}))

    params['url'] = endpoint + "?" + '&'.join(pieces)

    # language headers first, then the fixed Android client identification
    headers = params['headers']
    headers.update(lang_info['headers'])
    headers['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
    headers['Accept'] = '*/*'
    return params


def response(resp):
    """Get response from google's search request

    Parses the JSON document embedded in ``resp.text`` and returns a list of
    result dicts for the ``images.html`` template, one per entry of
    ``ischj.metadata``.  A reply without a ``metadata`` list yields an empty
    result list.

    ``detect_google_sorry(resp)`` is called first; errors raised by it and JSON
    decoding errors are propagated to the caller.
    """
    detect_google_sorry(resp)

    # The JSON document is located by its opening ``{"ischj":`` marker, text in
    # front of it is skipped.  If the marker is missing, find() returns -1 and
    # loads() only sees the last character of the body.
    json_start = resp.text.find('{"ischj":')
    json_data = loads(resp.text[json_start:])

    results = []

    # A missing "metadata" key is treated as "no results" instead of raising a
    # KeyError.
    for item in json_data["ischj"].get("metadata", []):
        page = item["result"]
        image = item["original_image"]

        result_item = {
            'url': page["referrer_url"],
            'title': page["page_title"],
            'content': item["text_in_grid"]["snippet"],
            'source': page["site_title"],
            'img_format': f'{image["width"]} x {image["height"]}',
            'img_src': image["url"],
            'thumbnail_src': item["thumbnail"]["url"],
            'template': 'images.html',
        }

        # optional IPTC metadata: creator list and copyright notice
        iptc = page.get('iptc', {})

        author = iptc.get('creator')
        if author:
            result_item['author'] = ', '.join(author)

        copyright_notice = iptc.get('copyright_notice')
        if copyright_notice:
            result_item['source'] += ' / ' + copyright_notice

        # optional file size, appended to the source label in parentheses
        file_size = item.get('gsa', {}).get('file_size')
        if file_size:
            result_item['source'] += f' ({file_size})'

        results.append(result_item)

    return results
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[pylint] tag PYLINT_FILES by comment `# lint: pylint` These py files are linted by `test.pylint`, all other files are linted by `test.pep8`. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-04-26 20:18:20 +02:00			`# lint: pylint`
[mod] google-images: slightly improvements of the engine Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-09-21 18:59:55 +02:00			`"""This is the implementation of the google images engine using the google`
			`internal API used the Google Go Android app.`

use the internal API for google images 2022-09-20 20:35:55 +02:00			`This internal API offer results in`
[mod] google-images: slightly improvements of the engine Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-09-21 18:59:55 +02:00
use the internal API for google images 2022-09-20 20:35:55 +02:00			`- JSON (_fmt:json)`
			`- Protobuf (_fmt:pb)`
			`- Protobuf compressed? (_fmt:pc)`
			`- HTML (_fmt:html)`
			`- Protobuf encoded in JSON (_fmt:jspb).`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`"""`
[enh] added google images engine 2013-10-19 22:19:14 +02:00
use the internal API for google images 2022-09-20 20:35:55 +02:00			`from urllib.parse import urlencode`
			`from json import loads`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
			`from searx.engines.google import (`
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`get_lang_info,`
[fix] pep8 2020-07-08 00:46:03 +02:00			`time_range_dict,`
[fix] revise of the google-news engine This revise is based on the methods developed in the revise of the google engine (see commit 410c2f9). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-22 18:49:45 +01:00			`detect_google_sorry,`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`)`

[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`# pylint: disable=unused-import`
[mod] Google: fetch engine traits (data_type: supported_languages) Implements a fetch_traits function for the Google engines. .. note:: Does not include migration of the request methode from 'supported_languages' to 'traits' (EngineTraits) object! Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-10-08 11:32:08 +02:00			`from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`# pylint: enable=unused-import`

[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# about`
			`about = {`
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`"website": 'https://images.google.com',`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`"wikidata_id": 'Q521550',`
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`"official_api_documentation": 'https://developers.google.com/custom-search',`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`"use_official_api": False,`
			`"require_api_key": False,`
use the internal API for google images 2022-09-20 20:35:55 +02:00			`"results": 'JSON',`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`}`

add comments to google-engines 2014-09-01 15:10:05 +02:00			`# engine dependent config`
[enh] add more categories 2021-12-22 16:58:52 +01:00			`categories = ['images', 'web']`
use the internal API for google images 2022-09-20 20:35:55 +02:00			`paging = True`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`use_locale_domain = True`
add time range search for google images 2016-07-18 17:25:40 +02:00			`time_range_support = True`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`safesearch = True`
[mod] add 'Accept-Language' HTTP header to online processores Most engines that support languages (and regions) use the Accept-Language from the WEB browser to build a response that fits to the language (and region). - add new engine option: send_accept_language_header Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-08-01 17:01:59 +02:00			`send_accept_language_header = True`
[enh] added google images engine 2013-10-19 22:19:14 +02:00
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`filter_mapping = {0: 'images', 1: 'active', 2: 'active'}`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
[fix] pep8 2020-07-08 00:46:03 +02:00
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`def request(query, params):`
use the internal API for google images 2022-09-20 20:35:55 +02:00			`"""Google-Image search request"""`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`lang_info = get_lang_info(params, supported_languages, language_aliases, False)`

			`query_url = (`
			`'https://'`
			`+ lang_info['subdomain']`
			`+ '/search'`
			`+ "?"`
use the internal API for google images 2022-09-20 20:35:55 +02:00			`+ urlencode(`
			`{`
			`'q': query,`
			`'tbm': "isch",`
			`**lang_info['params'],`
			`'ie': "utf8",`
			`'oe': "utf8",`
			`'asearch': 'isch',`
			`'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),`
			`}`
			`)`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`)`
[fix] google images paging - closes #571 2016-08-13 00:43:21 +02:00
[fix] time range detection 2016-07-26 00:22:05 +02:00			`if params['time_range'] in time_range_dict:`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})`
			`if params['safesearch']:`
			`query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})`
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00			`params['url'] = query_url`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
[enh] google engine: supports "default language" Same behaviour behaviour than Whoogle [1]. Only the google engine with the "Default language" choice "(all)"" is changed by this patch. When searching for a locate place, the result are in the expect language, without missing results [2]: > When a language is not specified, the language interpretation is left up to > Google to decide how the search results should be delivered. The query parameters are copied from Whoogle. With the ``all`` language: - add parameter ``source=lnt`` - don't use parameter ``lr`` - don't add a ``Accept-Language`` HTTP header. The new signature of function ``get_lang_info()`` is: lang_info = get_lang_info(params, lang_list, custom_aliases, supported_any_language) Argument ``supported_any_language`` is True for google.py and False for the other google engines. With this patch the function now returns: - query parameters: ``lang_info['params']`` - HTTP headers: ``lang_info['headers']`` - and as before this patch: - ``lang_info['subdomain']`` - ``lang_info['country']`` - ``lang_info['language']`` [1] https://github.com/benbusby/whoogle-search [2] https://github.com/benbusby/whoogle-search/releases/tag/v0.5.4 2021-06-06 08:18:07 +02:00			`params['headers'].update(lang_info['headers'])`
use the internal API for google images 2022-09-20 20:35:55 +02:00			`params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'`
			`params['headers']['Accept'] = '/'`
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`return params`

[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`def response(resp):`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00			`"""Get response from google's search request"""`
Use string formatter to create source and img_format labels (#1566) google_images : use JSON embedded in HTML (engine expected pure JSON) 2019-05-28 05:33:31 +02:00			`results = []`

[fix] revise of the google-news engine This revise is based on the methods developed in the revise of the google engine (see commit 410c2f9). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-22 18:49:45 +01:00			`detect_google_sorry(resp)`
[fix] revise google images engine this commit is picked from #1985 2020-07-07 21:59:15 +02:00
[mod] google-images: slightly improvements of the engine Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-09-21 18:59:55 +02:00			`json_start = resp.text.find('{"ischj":')`
			`json_data = loads(resp.text[json_start:])`
[fix] normalize the language & region aspects of all google engines BTW: make the engines ready for search.checker: - replace eval_xpath by eval_xpath_getindex and eval_xpath_list - google_images: remove outer try/except block Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-01-26 11:49:27 +01:00
[mod] google-images: slightly improvements of the engine Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2022-09-21 18:59:55 +02:00			`for item in json_data["ischj"]["metadata"]:`

			`result_item = {`
			`'url': item["result"]["referrer_url"],`
			`'title': item["result"]["page_title"],`
			`'content': item["text_in_grid"]["snippet"],`
			`'source': item["result"]["site_title"],`
			`'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',`
			`'img_src': item["original_image"]["url"],`
			`'thumbnail_src': item["thumbnail"]["url"],`
			`'template': 'images.html',`
			`}`

			`author = item["result"].get('iptc', {}).get('creator')`
			`if author:`
			`result_item['author'] = ', '.join(author)`

			`copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')`
			`if copyright_notice:`
			`result_item['source'] += ' / ' + copyright_notice`

			`file_size = item.get('gsa', {}).get('file_size')`
			`if file_size:`
			`result_item['source'] += ' (%s)' % file_size`

			`results.append(result_item)`
Fix google image search - Because there is not full image url in the dom, we replace "image_url" with the same url as the "url" (url of source). See example HTML https://gist.github.com/Nachtalb/2dea8a4d2c723c49226ad9645838121f - Remove unused import - Fix google image search title - Keep google image safe value up to date 2019-04-12 23:12:56 +02:00
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`return results`