forked from Ponysearch/Ponysearch
Merge pull request #547 from return42/fix-442
[fix] google & google video engines
This commit is contained in:
commit
328473befd
2 changed files with 32 additions and 36 deletions
|
@ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]'
|
||||||
|
|
||||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||||
# from the links not the links itself.
|
# from the links not the links itself.
|
||||||
suggestion_xpath = '//div[contains(@class, "card-section")]//a'
|
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
||||||
|
|
||||||
# Since google does *auto-correction* on the first query these are not really
|
|
||||||
# *spelling suggestions*, we use them anyway.
|
|
||||||
spelling_suggestion_xpath = '//div[@class="med"]/p/a'
|
|
||||||
|
|
||||||
|
|
||||||
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
|
||||||
"""Composing various language properties for the google engines.
|
"""Composing various language properties for the google engines.
|
||||||
|
@ -322,7 +317,6 @@ def response(resp):
|
||||||
|
|
||||||
# convert the text to dom
|
# convert the text to dom
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
|
||||||
# results --> answer
|
# results --> answer
|
||||||
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
|
||||||
if answer_list:
|
if answer_list:
|
||||||
|
@ -379,9 +373,6 @@ def response(resp):
|
||||||
# append suggestion
|
# append suggestion
|
||||||
results.append({'suggestion': extract_text(suggestion)})
|
results.append({'suggestion': extract_text(suggestion)})
|
||||||
|
|
||||||
for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
|
|
||||||
results.append({'correction': extract_text(correction)})
|
|
||||||
|
|
||||||
# return results
|
# return results
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
|
@ -31,13 +31,9 @@ from searx.engines.google import (
|
||||||
get_lang_info,
|
get_lang_info,
|
||||||
time_range_dict,
|
time_range_dict,
|
||||||
filter_mapping,
|
filter_mapping,
|
||||||
results_xpath,
|
|
||||||
g_section_with_header,
|
g_section_with_header,
|
||||||
title_xpath,
|
title_xpath,
|
||||||
href_xpath,
|
|
||||||
content_xpath,
|
|
||||||
suggestion_xpath,
|
suggestion_xpath,
|
||||||
spelling_suggestion_xpath,
|
|
||||||
detect_google_sorry,
|
detect_google_sorry,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -74,11 +70,27 @@ def _re(regexpr):
|
||||||
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
|
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
|
||||||
return RE_CACHE[regexpr]
|
return RE_CACHE[regexpr]
|
||||||
|
|
||||||
|
|
||||||
|
def scrap_out_thumbs_src(dom):
    """Scrap out thumbnail URLs from ``google.ldi={...}`` <script> tags.

    Returns a dict that maps an image id (e.g. ``dimg_35``) to the ``http...``
    URL found next to it in the script text.  The ``\\u003d`` and ``\\u0026``
    escape sequences in the URL are decoded to ``=`` and ``&``.
    """
    ret_val = {}
    thumb_name = 'dimg_'
    # "dimg_35":"https://i.ytimg.c....",
    # The pattern does not depend on the script, look it up only once.
    id_url_re = _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)')

    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
        # .text may be None; fall back to an empty string so findall() is safe
        _script = script.text or ''
        for img_id, img_url in id_url_re.findall(_script):
            img_url = img_url.replace(r'\u003d', '=')
            img_url = img_url.replace(r'\u0026', '&')
            ret_val[img_id] = img_url

    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
|
||||||
|
|
||||||
|
|
||||||
def scrap_out_thumbs(dom):
|
def scrap_out_thumbs(dom):
|
||||||
"""Scrap out thumbnail data from <script> tags.
|
"""Scrap out thumbnail data from <script> tags.
|
||||||
"""
|
"""
|
||||||
ret_val = {}
|
ret_val = {}
|
||||||
thumb_name = 'vidthumb'
|
thumb_name = 'dimg_'
|
||||||
|
|
||||||
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
|
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
|
||||||
_script = script.text
|
_script = script.text
|
||||||
|
@ -88,20 +100,11 @@ def scrap_out_thumbs(dom):
|
||||||
if not _imgdata:
|
if not _imgdata:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# var ii=['vidthumb4','vidthumb7']
|
# var ii=['dimg_17']
|
||||||
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
|
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
|
||||||
# At least the equal sign in the URL needs to be decoded
|
# At least the equal sign in the URL needs to be decoded
|
||||||
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
|
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
|
||||||
|
|
||||||
# {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
|
|
||||||
for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
|
|
||||||
_script = script.text
|
|
||||||
for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
|
|
||||||
match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
|
|
||||||
if match:
|
|
||||||
# At least the equal sign in the URL needs to be decoded
|
|
||||||
ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
|
|
||||||
|
|
||||||
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
|
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
|
||||||
return ret_val
|
return ret_val
|
||||||
|
|
||||||
|
@ -145,9 +148,11 @@ def response(resp):
|
||||||
# convert the text to dom
|
# convert the text to dom
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
vidthumb_imgdata = scrap_out_thumbs(dom)
|
vidthumb_imgdata = scrap_out_thumbs(dom)
|
||||||
|
thumbs_src = scrap_out_thumbs_src(dom)
|
||||||
|
logger.debug(str(thumbs_src))
|
||||||
|
|
||||||
# parse results
|
# parse results
|
||||||
for result in eval_xpath_list(dom, results_xpath):
|
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
|
||||||
|
|
||||||
# google *sections*
|
# google *sections*
|
||||||
if extract_text(eval_xpath(result, g_section_with_header)):
|
if extract_text(eval_xpath(result, g_section_with_header)):
|
||||||
|
@ -155,21 +160,24 @@ def response(resp):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
|
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
|
||||||
url = eval_xpath_getindex(result, href_xpath, 0)
|
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
|
||||||
c_node = eval_xpath_getindex(result, content_xpath, 0)
|
|
||||||
|
|
||||||
# <img id="vidthumb1" ...>
|
# <img id="vidthumb1" ...>
|
||||||
img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
|
img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
|
||||||
if img_id is None:
|
if img_id is None:
|
||||||
|
logger.error("no img_id for: %s" % result)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
img_src = vidthumb_imgdata.get(img_id, None)
|
img_src = vidthumb_imgdata.get(img_id, None)
|
||||||
if not img_src:
|
if not img_src:
|
||||||
logger.error("no vidthumb imgdata for: %s" % img_id)
|
logger.error("no vidthumb imgdata for: %s" % img_id)
|
||||||
img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
|
img_src = thumbs_src.get(img_id, "")
|
||||||
|
|
||||||
length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
|
length = extract_text(eval_xpath(
|
||||||
content = extract_text(eval_xpath(c_node, './/div[2]/span'))
|
result, './/div[contains(@class, "P7xzyf")]/span/span'))
|
||||||
pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
|
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
|
||||||
|
content = extract_text(c_node)
|
||||||
|
pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
'url': url,
|
'url': url,
|
||||||
|
@ -186,7 +194,4 @@ def response(resp):
|
||||||
# append suggestion
|
# append suggestion
|
||||||
results.append({'suggestion': extract_text(suggestion)})
|
results.append({'suggestion': extract_text(suggestion)})
|
||||||
|
|
||||||
for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
|
|
||||||
results.append({'correction': extract_text(correction)})
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
Loading…
Reference in a new issue