[enh] bing_images: use data from embedded JSON to improve results (e.g. real page title)

Use data from the embedded JSON to improve results (e.g. the real page title), add image format and source info (see the related PR), and improve the paging logic (it now works).
This commit is contained in:
Frank de Lange 2019-07-27 08:22:02 +02:00 committed by Alexandre Flament
parent f34b5cedb1
commit 11fc9913e9
2 changed files with 48 additions and 35 deletions
searx/engines
tests/unit/engines

View file

@ -10,9 +10,6 @@
@stable no (HTML can change)
@parse url, title, img_src
@todo currently there are up to 35 images receive per page,
because bing does not parse count=10.
limited response to 10 images
"""
from lxml import html
@ -28,10 +25,15 @@ safesearch = True
time_range_support = True
language_support = True
supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28
# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'
search_string = 'images/search'\
'?{query}'\
'&count={count}'\
'&first={first}'\
'&FORM=IBASEP'
time_range_string = '&qft=+filterui:age-lt{interval}'
time_range_dict = {'day': '1440',
'week': '10080',
@ -44,16 +46,14 @@ safesearch_types = {2: 'STRICT',
0: 'OFF'}
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
offset = ((params['pageno'] - 1) * number_of_results) + 1
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset)
count=number_of_results,
first=offset)
language = match_language(params['language'], supported_languages, language_aliases).lower()
@ -77,32 +77,31 @@ def response(resp):
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'):
link = result.xpath('./a')[0]
for result in dom.xpath('//div[@class="imgpt"]'):
# TODO find actual title
title = link.xpath('.//img/@alt')[0]
img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
# Microsoft seems to experiment with this code so don't make the path too specific,
# just catch the text section for the first anchor in img_info assuming this to be
# the originating site.
source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
# parse json-data (it is required to add a space, to make it parsable)
json_data = loads(_quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m')))
try:
m = loads(result.xpath('./a/@m')[0])
url = json_data.get('purl')
img_src = json_data.get('murl')
thumbnail = json_data.get('turl')
# strip 'Unicode private use area' highlighting, they render to Tux
# the Linux penguin and a standing diamond on my machine...
title = m.get('t', '').replace(u'\ue000', '').replace(u'\ue001', '')
results.append({'template': 'images.html',
'url': m['purl'],
'thumbnail_src': m['turl'],
'img_src': m['murl'],
'content': '',
'title': title,
'source': source,
'img_format': img_format})
except:
continue
# append result
results.append({'template': 'images.html',
'url': url,
'title': title,
'content': '',
'thumbnail_src': thumbnail,
'img_src': img_src})
# TODO stop parsing if 10 images are found
# if len(results) >= 10:
# break
# return results
return results

View file

@ -53,17 +53,25 @@ class TestBingImagesEngine(SearxTestCase):
<li>
<div>
<div class="imgpt">
<a m='{"purl":"page_url","murl":"img_url","turl":"thumb_url"}'>
<a m='{"purl":"page_url","murl":"img_url","turl":"thumb_url","t":"Page 1 title"}'>
<img src="" alt="alt text" />
</a>
<div class="img_info">
<span>1 x 1 - jpeg</span>
<a>1.example.org</a>
</div>
</div>
<div></div>
</div>
<div>
<div class="imgpt">
<a m='{"purl":"page_url2","murl":"img_url2","turl":"thumb_url2"}'>
<a m='{"purl":"page_url2","murl":"img_url2","turl":"thumb_url2","t":"Page 2 title"}'>
<img src="" alt="alt text 2" />
</a>
<div class="img_info">
<span>2 x 2 - jpeg</span>
<a>2.example.org</a>
</div>
</div>
</div>
</li>
@ -72,9 +80,13 @@ class TestBingImagesEngine(SearxTestCase):
<li>
<div>
<div class="imgpt">
<a m='{"purl":"page_url3","murl":"img_url3","turl":"thumb_url3"}'>
<a m='{"purl":"page_url3","murl":"img_url3","turl":"thumb_url3","t":"Page 3 title"}'>
<img src="" alt="alt text 3" />
</a>
<div class="img_info">
<span>3 x 3 - jpeg</span>
<a>3.example.org</a>
</div>
</div>
</div>
</li>
@ -86,11 +98,13 @@ class TestBingImagesEngine(SearxTestCase):
results = bing_images.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 3)
self.assertEqual(results[0]['title'], 'alt text')
self.assertEqual(results[0]['title'], 'Page 1 title')
self.assertEqual(results[0]['url'], 'page_url')
self.assertEqual(results[0]['content'], '')
self.assertEqual(results[0]['thumbnail_src'], 'thumb_url')
self.assertEqual(results[0]['img_src'], 'img_url')
self.assertEqual(results[0]['img_format'], '1 x 1 - jpeg')
self.assertEqual(results[0]['source'], '1.example.org')
def test_fetch_supported_languages(self):
html = """