forked from Ponysearch/Ponysearch
[fix] engine - Crossref
Crossref was broken on the result types journal-issue and component. The old code made many implicit assumptions and broke during parsing. The assumptions are now explicit and have been checked against the API.
This commit is contained in:
parent
ed6a5a01bb
commit
74600c028d
1 changed files with 43 additions and 39 deletions
|
@ -1,60 +1,64 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Semantic Scholar (Science)
|
||||
"""
|
||||
# pylint: disable=use-dict-literal
|
||||
"""CrossRef"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import html_to_text
|
||||
from datetime import datetime
|
||||
|
||||
# Engine metadata for Crossref.  Each key appears exactly once; the values are
# the ones introduced by this commit (double-quoted strings, documentation
# link pointing at https://api.crossref.org).
about = {
    "website": "https://www.crossref.org/",
    "wikidata_id": "Q5188229",
    "official_api_documentation": "https://api.crossref.org",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
|
||||
|
||||
# Engine settings.
categories = ["science", "scientific publications"]
paging = True
# Endpoint that ``request`` appends the urlencoded query string to.
search_url = "https://api.crossref.org/works"
|
||||
|
||||
|
||||
def request(query, params):
    """Store the Crossref search URL for *query* in ``params["url"]``.

    The ``offset`` query argument advances by 20 for every page after the
    first, derived from ``params["pageno"]``.  The same ``params`` mapping is
    returned.
    """
    offset = (params["pageno"] - 1) * 20
    query_string = urlencode({"query": query, "offset": offset})
    params["url"] = f"{search_url}?{query_string}"
    return params
|
||||
|
||||
|
||||
def _first(values):
    """Return the first element of *values*, or ``None`` if it is missing or empty."""
    return values[0] if values else None


def _published_date(record):
    """Build a ``datetime`` from ``record["published"]["date-parts"][0]``.

    Only the leading year/month/day entries are used; a missing month or day
    defaults to 1.  Returns ``None`` when no usable year is present or the
    values do not form a valid date.
    """
    published = record.get("published") or {}
    date_parts = published.get("date-parts") or [[]]
    parts = []
    for part in (date_parts[0] or [])[:3]:
        if part is None:
            # Stop at the first hole: later parts are meaningless without it.
            break
        parts.append(part)
    if not parts:
        return None
    try:
        # Pad first, then slice: ``parts + [1, 1][:3]`` would slice only the
        # padding and leak the extra 1s into the hour/minute arguments.
        return datetime(*(parts + [1, 1])[:3])
    except (TypeError, ValueError):
        return None


def _author_name(author):
    """Join the given and family name of *author*, falling back to its ``name`` entry."""
    full_name = " ".join(part for part in (author.get("given"), author.get("family")) if part)
    return full_name or author.get("name") or ""


def response(resp):
    """Convert a Crossref ``/works`` JSON response into a list of result dicts.

    ``resp.json()`` must return a mapping whose ``["message"]["items"]`` is an
    iterable of record dicts.  Records of type ``component`` are skipped.
    Every other record becomes a dict using the ``paper.html`` template.

    For ``book-chapter`` records the title is the container title, followed by
    the chapter title in parentheses when the two differ.  For all other types
    the record title is used and the container title is stored as ``journal``;
    if the record has no title, the container title becomes the title and
    ``journal`` is ``None``.

    ``publishedDate`` is only set when a date can be built from the record.
    """
    results = []
    for record in resp.json()["message"]["items"]:
        record_type = record.get("type")
        if record_type == "component":
            # These seem to be files published along with papers. Not something you'd search for
            continue

        result = {
            "template": "paper.html",
            "content": record.get("abstract", ""),
            "doi": record.get("DOI"),
            "pages": record.get("page"),
            "publisher": record.get("publisher"),
            "tags": record.get("subject"),
            "type": record_type,
            "url": record.get("URL"),
            "volume": record.get("volume"),
        }

        own_title = _first(record.get("title"))
        container_title = _first(record.get("container-title"))

        if record_type == "book-chapter":
            if own_title and container_title:
                result["title"] = container_title
                if own_title.lower().strip() != container_title.lower().strip():
                    result["title"] += f" ({own_title})"
            else:
                # Only one of the two titles is available: use whichever exists.
                result["title"] = container_title or own_title
        elif own_title is not None:
            result["title"] = own_title
            result["journal"] = container_title
        else:
            result["title"] = container_title
            result["journal"] = None

        # Prefer the primary resource URL over the record's generic URL.
        primary_url = ((record.get("resource") or {}).get("primary") or {}).get("URL")
        if primary_url:
            result["url"] = primary_url

        published = _published_date(record)
        if published is not None:
            result["publishedDate"] = published

        authors = (_author_name(author) for author in record.get("author") or [])
        result["authors"] = [name for name in authors if name]
        result["isbn"] = record.get("isbn") or [
            entry["value"] for entry in record.get("isbn-type") or [] if "value" in entry
        ]
        # The record's "link" entries are deliberately not exposed as "pdf_url":
        # not all of them are PDFs, even if the URL ends with ".pdf".

        results.append(result)

    return results
|
||||
|
|
Loading…
Reference in a new issue