[fix] engine - Crossref

Crossref was broken on the result types journal-issue and component. The old code
made many implicit assumptions and broke during parsing. The assumptions are now
explicit and are checked against the API.
This commit is contained in:
jazzzooo 2023-09-13 16:21:10 +00:00 committed by Markus Heiser
parent ed6a5a01bb
commit 74600c028d

View file

@ -1,60 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint # lint: pylint
"""Semantic Scholar (Science) """CrossRef"""
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import html_to_text from datetime import datetime
# Engine metadata consumed by the searx "about" machinery.
about = {
    "website": "https://www.crossref.org/",
    "wikidata_id": "Q5188229",
    "official_api_documentation": "https://api.crossref.org",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# Crossref indexes scholarly works, hence the science categories.
categories = ["science", "scientific publications"]

# The REST API accepts an "offset" parameter, so paging is supported.
paging = True

# Endpoint queried by request().
search_url = "https://api.crossref.org/works"
def request(query, params):
    """Build the Crossref ``/works`` query URL for the requested page.

    20 results per page; the API's ``offset`` parameter is derived from
    ``params["pageno"]``.
    """
    query_args = {
        "query": query,
        "offset": 20 * (params["pageno"] - 1),
    }
    params["url"] = f"{search_url}?{urlencode(query_args)}"
    return params
def response(resp):
    """Parse a Crossref ``/works`` JSON response into ``paper.html`` results.

    Records of type ``component`` are skipped: they appear to be files
    published alongside papers, not something a user would search for.
    Records of type ``journal-issue`` carry no ``title`` field, so the
    container title is used as a fallback.
    """
    results = []
    for record in resp.json()["message"]["items"]:
        if record["type"] == "component":
            # These seem to be files published along with papers.
            # Not something you'd search for.
            continue

        result = {
            "template": "paper.html",
            "content": record.get("abstract", ""),
            "doi": record.get("DOI"),
            "pages": record.get("page"),
            "publisher": record.get("publisher"),
            "tags": record.get("subject"),
            "type": record.get("type"),
            "url": record.get("URL"),
            "volume": record.get("volume"),
        }

        if record["type"] == "book-chapter":
            # The containing book is the primary title; append the chapter
            # title only when it differs from the book title.
            result["title"] = record["container-title"][0]
            if record["title"][0].lower().strip() != result["title"].lower().strip():
                result["title"] += f" ({record['title'][0]})"
        else:
            # Some result types (e.g. journal-issue) have no "title" field;
            # fall back to the container title in that case.
            result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
            result["journal"] = record.get("container-title", [None])[0] if "title" in record else None

        # Prefer the primary resource URL over the generic record URL.
        if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
            result["url"] = record["resource"]["primary"]["URL"]

        if "published" in record and "date-parts" in record["published"]:
            # "date-parts" is [year], [year, month] or [year, month, day]:
            # pad with default month/day, then keep at most three parts.
            # BUGFIX: the padding must be applied *before* slicing — slicing
            # binds tighter than "+", so "parts + [1, 1][:3]" passed the
            # padding values on to datetime() as hour/minute.
            date_parts = record["published"]["date-parts"][0]
            result["publishedDate"] = datetime(*((date_parts + [1, 1])[:3]))

        result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
        result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]

        # All the links are not PDFs, even if the URL ends with ".pdf"
        # result["pdf_url"] = record.get("link", [{"URL": None}])[0]["URL"]

        results.append(result)

    return results