forked from Ponysearch/Ponysearch
[mod] json_engine: add content_html_to_text and title_html_to_text
Some JSON APIs return HTML in either the title or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, both False by default. When one is set to True, searx.utils.html_to_text is applied to the corresponding field to remove the HTML tags. The crossref, openairedatasets and openairepublications engines are updated to use them.
This commit is contained in:
parent
436d366448
commit
ff84a1af35
2 changed files with 19 additions and 5 deletions
|
@ -3,13 +3,15 @@
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from json import loads
|
from json import loads
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from searx.utils import to_string
|
from searx.utils import to_string, html_to_text
|
||||||
|
|
||||||
|
|
||||||
search_url = None
|
search_url = None
|
||||||
url_query = None
|
url_query = None
|
||||||
content_query = None
|
content_query = None
|
||||||
title_query = None
|
title_query = None
|
||||||
|
content_html_to_text = False
|
||||||
|
title_html_to_text = False
|
||||||
paging = False
|
paging = False
|
||||||
suggestion_query = ''
|
suggestion_query = ''
|
||||||
results_query = ''
|
results_query = ''
|
||||||
|
@ -92,9 +94,17 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def identity(arg):
|
||||||
|
return arg
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
json = loads(resp.text)
|
json = loads(resp.text)
|
||||||
|
|
||||||
|
title_filter = html_to_text if title_html_to_text else identity
|
||||||
|
content_filter = html_to_text if content_html_to_text else identity
|
||||||
|
|
||||||
if results_query:
|
if results_query:
|
||||||
rs = query(json, results_query)
|
rs = query(json, results_query)
|
||||||
if not len(rs):
|
if not len(rs):
|
||||||
|
@ -111,8 +121,8 @@ def response(resp):
|
||||||
content = ""
|
content = ""
|
||||||
results.append({
|
results.append({
|
||||||
'url': to_string(url),
|
'url': to_string(url),
|
||||||
'title': to_string(title),
|
'title': title_filter(to_string(title)),
|
||||||
'content': to_string(content),
|
'content': content_filter(to_string(content)),
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
for url, title, content in zip(
|
for url, title, content in zip(
|
||||||
|
@ -122,8 +132,8 @@ def response(resp):
|
||||||
):
|
):
|
||||||
results.append({
|
results.append({
|
||||||
'url': to_string(url),
|
'url': to_string(url),
|
||||||
'title': to_string(title),
|
'title': title_filter(to_string(title)),
|
||||||
'content': to_string(content),
|
'content': content_filter(to_string(content)),
|
||||||
})
|
})
|
||||||
|
|
||||||
if not suggestion_query:
|
if not suggestion_query:
|
||||||
|
|
|
@ -267,7 +267,9 @@ engines:
|
||||||
search_url : https://search.crossref.org/dois?q={query}&page={pageno}
|
search_url : https://search.crossref.org/dois?q={query}&page={pageno}
|
||||||
url_query : doi
|
url_query : doi
|
||||||
title_query : title
|
title_query : title
|
||||||
|
title_html_to_text: True
|
||||||
content_query : fullCitation
|
content_query : fullCitation
|
||||||
|
content_html_to_text: True
|
||||||
categories : science
|
categories : science
|
||||||
shortcut : cr
|
shortcut : cr
|
||||||
about:
|
about:
|
||||||
|
@ -757,6 +759,7 @@ engines:
|
||||||
url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
|
url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
|
||||||
title_query : metadata/oaf:entity/oaf:result/title/$
|
title_query : metadata/oaf:entity/oaf:result/title/$
|
||||||
content_query : metadata/oaf:entity/oaf:result/description/$
|
content_query : metadata/oaf:entity/oaf:result/description/$
|
||||||
|
content_html_to_text: True
|
||||||
categories : science
|
categories : science
|
||||||
shortcut : oad
|
shortcut : oad
|
||||||
timeout: 5.0
|
timeout: 5.0
|
||||||
|
@ -776,6 +779,7 @@ engines:
|
||||||
url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
|
url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
|
||||||
title_query : metadata/oaf:entity/oaf:result/title/$
|
title_query : metadata/oaf:entity/oaf:result/title/$
|
||||||
content_query : metadata/oaf:entity/oaf:result/description/$
|
content_query : metadata/oaf:entity/oaf:result/description/$
|
||||||
|
content_html_to_text: True
|
||||||
categories : science
|
categories : science
|
||||||
shortcut : oap
|
shortcut : oap
|
||||||
timeout: 5.0
|
timeout: 5.0
|
||||||
|
|
Loading…
Reference in a new issue