Merge pull request #837 from dalf/fix-ina

[fix] ina engine
This commit is contained in:
Markus Heiser 2022-01-29 07:55:21 +00:00 committed by GitHub
commit 1a5e1e74ef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -3,12 +3,10 @@
INA (Videos) INA (Videos)
""" """
from json import loads
from html import unescape from html import unescape
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from dateutil import parser from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.utils import extract_text
# about # about
about = { about = {
@ -24,25 +22,24 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
page_size = 48 page_size = 12
# search-url # search-url
base_url = 'https://www.ina.fr' base_url = 'https://www.ina.fr'
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables # specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href' url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]' title_xpath = './/div[contains(@class,"title-bloc-small")]'
thumbnail_xpath = './/img/@src' content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
publishedDate_xpath = './/span[@class="broadcast"]' thumbnail_xpath = './/img/@data-src'
content_xpath = './/p[@class="media-body__summary"]' publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
# do search-request # do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(ps=page_size, start=params['pageno'] * page_size, query=urlencode({'q': query})) params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
return params return params
@ -51,26 +48,17 @@ def response(resp):
results = [] results = []
# we get html in a JSON container... # we get html in a JSON container...
response = loads(resp.text) dom = html.fromstring(resp.text)
dom = html.fromstring(response)
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath_list(dom, results_xpath):
videoid = result.xpath(url_xpath)[0] url_relative = eval_xpath_getindex(result, url_xpath, 0)
url = base_url + videoid url = base_url + url_relative
title = unescape(extract_text(result.xpath(title_xpath))) title = unescape(extract_text(eval_xpath(result, title_xpath)))
try: thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
except: eval_xpath(result, content_xpath)
thumbnail = '' )
if thumbnail and thumbnail[0] == '/':
thumbnail = base_url + thumbnail
d = extract_text(result.xpath(publishedDate_xpath)[0])
d = d.split('/')
# force ISO date to avoid wrong parsing
d = "%s-%s-%s" % (d[2], d[1], d[0])
publishedDate = parser.parse(d)
content = extract_text(result.xpath(content_xpath))
# append result # append result
results.append( results.append(
@ -79,7 +67,6 @@ def response(resp):
'title': title, 'title': title,
'content': content, 'content': content,
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
) )