forked from Ponysearch/Ponysearch
8a69ade875
Sometimes there are two requests to Google (depending on the source IP): one to google.com, the second to google.fr (for instance). Going to https://www.google.com/ncr and saving the PREF cookie for future use prevents this (there is no redirection). But recently (or not?), doing this makes the search return English results even if Accept-Language is specified. There is still a way to prevent this: going to the preferences and setting the search language. I don't know if this can be done by searx. For now, a quick fix is to disable the use of the PREF cookie when the search language is not English (the google engine will be slower but returns the expected results).
141 lines
4 KiB
Python
141 lines
4 KiB
Python
# Google (Web)
|
|
#
|
|
# @website https://www.google.com
|
|
# @provide-api yes (https://developers.google.com/custom-search/)
|
|
#
|
|
# @using-api no
|
|
# @results HTML
|
|
# @stable no (HTML can change)
|
|
# @parse url, title, content, suggestion
|
|
|
|
from urllib import urlencode
|
|
from urlparse import urlparse, parse_qsl
|
|
from lxml import html
|
|
from searx.poolrequests import get
|
|
from searx.engines.xpath import extract_text, extract_url
|
|
|
|
# engine dependent config
categories = ['general']
# request() reads params['pageno'] to compute the result offset
paging = True
# request() reads params['language'] to build the Accept-Language header
language_support = True
|
|
|
|
# search-url
google_hostname = 'www.google.com'
search_path = '/search'
redirect_path = '/url'
images_path = '/images'
# template filled in by request(): {query} receives an already url-encoded
# "q=..." pair, {offset} the zero-based index of the first result
search_url = ('https://' +
              google_hostname +
              search_path +
              '?{query}&start={offset}&gbv=1')
|
|
|
|
# specific xpath variables
# used by response(): one node per search result, then fields relative to it
results_xpath = '//li[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
suggestion_xpath = '//p[@class="_Bmc"]'

# used by parse_images(): anchors inside a result and their href / thumbnail
images_xpath = './/div/a'
image_url_xpath = './@href'
image_img_src_xpath = './img/@src'
|
|
|
|
# module-level cache for the PREF cookie; '' means "not fetched yet"
pref_cookie = ''


# see https://support.google.com/websearch/answer/873?hl=en
def get_google_pref_cookie():
    """Return the Google ``PREF`` cookie, fetching it on first use.

    The cookie is requested from https://www.google.com/ncr (without
    following redirects) and cached in the module-level ``pref_cookie``
    so that later calls do not trigger another HTTP request.

    Returns:
        The cookie value, or ``''`` when the response carried no ``PREF``
        cookie; in that case the fetch is attempted again on the next call.
    """
    global pref_cookie
    if pref_cookie == '':
        resp = get('https://www.google.com/ncr', allow_redirects=False)
        # do not raise KeyError when the cookie is absent; keep the cache
        # empty so that the next call retries
        pref_cookie = resp.cookies.get("PREF", '')
    return pref_cookie
|
|
|
|
|
|
# remove google-specific tracking-url
def parse_url(url_string):
    """Unwrap a Google redirect link and return the target URL.

    A link is treated as a redirect when its path equals ``redirect_path``
    and its host is either ``google_hostname`` or empty (relative link).
    For such links the value of the ``q`` query parameter is returned;
    any other URL is returned unchanged.

    Raises:
        KeyError: if a redirect link has no ``q`` parameter.
    """
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    return url_string
|
|
|
|
|
|
# do search-request
def request(query, params):
    """Fill ``params`` with the URL, headers and cookies for a Google search.

    Args:
        query: the search terms; url-encoded here as the ``q`` parameter.
        params: request dict; ``params['pageno']`` and ``params['language']``
            are read, ``params['url']``, ``params['headers']`` and
            ``params['cookies']`` are written.

    Returns:
        The same ``params`` dict, updated in place.
    """
    # ten results per page, first page starts at offset 0
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].replace('_', '-').lower()

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}))

    params['headers']['Accept-Language'] = language
    if language.startswith('en'):
        # the PREF cookie is only used for English searches: with it Google
        # returns English results regardless of Accept-Language
        cookie = get_google_pref_cookie()
        if cookie:
            # never send an empty PREF cookie
            params['cookies']['PREF'] = cookie

    return params
|
|
|
|
|
|
# get response from search-request
def response(resp):
    """Parse a Google result page into a list of searx result dicts.

    Args:
        resp: the HTTP response; its ``text`` attribute is parsed as HTML.

    Returns:
        A list containing one ``{'url', 'title', 'content'}`` dict per
        regular result, followed by one ``{'suggestion'}`` dict per
        suggestion found on the page. Results that cannot be parsed are
        skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            # title lookup is inside the try block so that a result without
            # <h3> is skipped instead of aborting the whole page
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)

            if parsed_url.netloc == google_hostname:
                if parsed_url.path == search_path:
                    # remove the link to google news
                    continue
                if parsed_url.path == images_path:
                    # images result: only thumbnail image provided,
                    # so skipping image results (parse_images is not used)
                    continue

            # normal result
            content = extract_text(result.xpath(content_xpath)[0])
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except Exception:
            # best effort: drop results whose markup does not match
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
|
|
|
|
|
|
def parse_images(result):
    """Build image results from the anchors found inside ``result``.

    Args:
        result: an lxml element for one search result; anchors are
            selected with ``images_xpath``.

    Returns:
        A list of dicts with the keys ``url``, ``title``, ``content``,
        ``img_src`` and ``template`` (always ``'images.html'``). Anchors
        without an href or without an ``<img>`` child are skipped.
    """
    results = []
    for image in result.xpath(images_xpath):
        hrefs = image.xpath(image_url_xpath)
        thumbnails = image.xpath(image_img_src_xpath)
        if not hrefs or not thumbnails:
            # not an image link: skip it instead of raising IndexError
            continue

        url = parse_url(extract_text(hrefs[0]))
        img_src = extract_text(thumbnails[0])

        # append result
        results.append({'url': url,
                        'title': '',
                        'content': '',
                        'img_src': img_src,
                        'template': 'images.html'})

    return results
|