Ponysearch/searx/engines/google_images.py

"""
 Google (Images)

 @website     https://www.google.com
 @provide-api yes (https://developers.google.com/custom-search/)

 @using-api   no
 @results     HTML chunks with JSON inside
 @stable      no
 @parse       url, title, content, thumbnail_src, img_src
"""

from urllib import urlencode
from urlparse import parse_qs
from json import loads
from lxml import html

# engine dependent config
categories = ['images']
paging = True
safesearch = True

# URL template; {query} and {offset} are filled in via str.format
search_url = (
    'https://www.google.com/search'
    '?{query}'
    '&tbm=isch'
    '&ijn=1'
    '&start={offset}'
)


# do search-request
def request(query, params):
    """Build the search URL for ``query`` and store it in ``params['url']``.

    Reads ``params['pageno']`` to compute the ``start`` offset (page 1 maps
    to offset 0, each further page adds 100) and ``params['safesearch']`` to
    decide whether ``safe=active`` is appended to the URL.

    Returns the same ``params`` dict, with ``'url'`` set.
    """
    offset = (params['pageno'] - 1) * 100

    # the template only contains the {query} and {offset} placeholders
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset)

    # filter only when the engine-level switch and the request both ask for it
    if safesearch and params['safesearch']:
        params['url'] += '&' + urlencode({'safe': 'active'})

    return params


# get response from search-request
def response(resp):
    """Parse a search response into a list of image result dicts.

    ``resp.text`` is parsed as HTML.  Every ``div`` carrying a ``data-ved``
    attribute is treated as one candidate result:

    * the query string of its child link supplies ``imgrefurl`` (result url)
      and ``imgurl`` (full image url);
    * the JSON text of its ``rg_meta`` child supplies the title (``pt``),
      the content (``s``) and the thumbnail url (``tu``).

    Candidates that lack the link, the metadata block, or one of the fields
    above are skipped so that the remaining results are still returned.
    Invalid JSON inside ``rg_meta`` still propagates from ``loads``.

    Returns a list of dicts with the keys ``url``, ``title``, ``content``,
    ``thumbnail_src``, ``img_src`` and ``template``.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@data-ved]'):
        hrefs = result.xpath('./a/@href')
        meta_chunks = result.xpath('./div[@class="rg_meta"]/text()')

        # skip nodes without link or metadata instead of raising IndexError
        if not hrefs or not meta_chunks:
            continue

        # partition() yields an empty query string when the link has no '?'
        query_string = hrefs[0].partition('?')[2]
        data_query = {k: v[0] for k, v in parse_qs(query_string).items()}

        metadata = loads(meta_chunks[0])

        try:
            # http to https
            thumbnail_src = metadata['tu'].replace("http://", "https://")

            entry = {'url': data_query['imgrefurl'],
                     'title': metadata['pt'],
                     'content': metadata['s'],
                     # use the https-rewritten url, not the raw metadata['tu']
                     'thumbnail_src': thumbnail_src,
                     'img_src': data_query['imgurl'],
                     'template': 'images.html'}
        except KeyError:
            # a required field is missing: drop this result, keep the others
            continue

        # append result
        results.append(entry)

    # return results
    return results
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`"""`
			`Google (Images)`

			`@website https://www.google.com`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`@provide-api yes (https://developers.google.com/custom-search/)`
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00
[doc] correct google images docstring 2015-12-09 01:23:05 +01:00			`@using-api no`
			`@results HTML chunks with JSON inside`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`@stable no`
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`@parse url, title, img_src`
			`"""`
[enh] added google images engine 2013-10-19 22:19:14 +02:00
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`from urllib import urlencode`
			`from urlparse import parse_qs`
[enh] google images refactor 2013-10-19 23:12:18 +02:00			`from json import loads`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`from lxml import html`
[enh] added google images engine 2013-10-19 22:19:14 +02:00
add comments to google-engines 2014-09-01 15:10:05 +02:00			`# engine dependent config`
[mod] category -> images 2013-10-19 22:19:31 +02:00			`categories = ['images']`
add comments to google-engines 2014-09-01 15:10:05 +02:00			`paging = True`
[enh] add safesearch to google_images 2015-02-08 22:15:25 +01:00			`safesearch = True`
[enh] added google images engine 2013-10-19 22:19:14 +02:00
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`search_url = 'https://www.google.com/search'\`
			`'?{query}'\`
			`'&tbm=isch'\`
			`'&ijn=1'\`
			`'&start={offset}'`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00
[enh] added google images engine 2013-10-19 22:19:14 +02:00
add comments to google-engines 2014-09-01 15:10:05 +02:00			`# do search-request`
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`def request(query, params):`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`offset = (params['pageno'] - 1) * 100`
[enh] add safesearch to google_images 2015-02-08 22:15:25 +01:00
[enh] paging support for google images 2014-01-30 01:21:33 +01:00			`params['url'] = search_url.format(query=urlencode({'q': query}),`
[enh] add safesearch to google_images 2015-02-08 22:15:25 +01:00			`offset=offset,`
			`safesearch=safesearch)`
add comments to google-engines 2014-09-01 15:10:05 +02:00
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`if safesearch and params['safesearch']:`
			`params['url'] += '&' + urlencode({'safe': 'active'})`

[enh] added google images engine 2013-10-19 22:19:14 +02:00			`return params`

[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00
add comments to google-engines 2014-09-01 15:10:05 +02:00			`# get response from search-request`
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`def response(resp):`
			`results = []`
add comments to google-engines 2014-09-01 15:10:05 +02:00
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`dom = html.fromstring(resp.text)`
add comments to google-engines 2014-09-01 15:10:05 +02:00
			`# parse results`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`for result in dom.xpath('//div[@data-ved]'):`
			`data_url = result.xpath('./a/@href')[0]`
			`data_query = {k: v[0] for k, v in parse_qs(data_url.split('?', 1)[1]).iteritems()}`

			`metadata = loads(result.xpath('./div[@class="rg_meta"]/text()')[0])`

			`thumbnail_src = metadata['tu']`
add comments to google-engines 2014-09-01 15:10:05 +02:00
[enh] reduce the number of http outgoing connections. engines that still use http : gigablast, bing image for thumbnails, 1x and dbpedia autocompleter 2015-05-02 11:43:12 +02:00			`# http to https`
			`thumbnail_src = thumbnail_src.replace("http://", "https://")`

add comments to google-engines 2014-09-01 15:10:05 +02:00			`# append result`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`results.append({'url': data_query['imgrefurl'],`
			`'title': metadata['pt'],`
			`'content': metadata['s'],`
			`'thumbnail_src': metadata['tu'],`
			`'img_src': data_query['imgurl'],`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`'template': 'images.html'})`
add comments to google-engines 2014-09-01 15:10:05 +02:00
			`# return results`
[fix] replace the dead google images ajax api with a working one 2015-12-09 01:20:46 +01:00			`print len(results)`
[enh] added google images engine 2013-10-19 22:19:14 +02:00			`return results`