From 173b744ef0fa69c217381bfc000709ba7b336824 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 30 Oct 2020 09:36:11 +0100 Subject: [PATCH 1/3] [fix] digg - the ISO time stamp of published date has been changed Error pattern:: Engines cannot retrieve results: digg (unexpected crash time data '2020-10-16T14:09:55Z' does not match format '%Y-%m-%d %H:%M:%S') Signed-off-by: Markus Heiser --- searx/engines/digg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 831d698bc..cb007d40d 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -54,7 +54,8 @@ def response(resp): # parse results for result in search_result['mapped']: - published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S") + # 'created': {'ISO': '2020-10-16T14:09:55Z', ...} + published = datetime.strptime(result['created']['ISO'], "%Y-%m-%dT%H:%M:%SZ") # append result results.append({'url': result['url'], 'title': result['title'], From 6b0a896f01eb0c59a5ce8d46c18b3549f0007901 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 22 Nov 2020 11:37:12 +0100 Subject: [PATCH 2/3] [mod] digg - pylint searx/engines/digg.py Eliminate redundant file names which are tested by test.pylint and ignored by test.pep8 Signed-off-by: Markus Heiser --- Makefile | 16 ++++++++-------- searx/engines/digg.py | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 326c19336..4a873f0c6 100644 --- a/Makefile +++ b/Makefile @@ -212,15 +212,15 @@ gecko.driver: PHONY += test test.sh test.pylint test.pep8 test.unit test.coverage test.robot test: buildenv test.pylint test.pep8 test.unit gecko.driver test.robot +PYLINT_FILES=\ + searx/preferences.py \ + searx/testing.py \ + searx/engines/gigablast.py \ + searx/engines/deviantart.py \ + searx/engines/digg.py -# TODO: balance linting with pylint test.pylint: pyenvinstall - $(call cmd,pylint,\ - searx/preferences.py \ - searx/testing.py \ - 
searx/engines/gigablast.py \ - searx/engines/deviantart.py \ - ) + $(call cmd,pylint,$(PYLINT_FILES)) $(call cmd,pylint,\ --disable=$(PYLINT_SEARX_DISABLE_OPTION) \ --additional-builtins=$(PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES) \ @@ -249,7 +249,7 @@ test.sh: test.pep8: pyenvinstall @echo "TEST pycodestyle (formerly pep8)" - $(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py, searx/engines/deviantart.py' \ + $(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, $(foreach f,$(PYLINT_FILES),$(f),)' \ --max-line-length=120 --ignore "E117,E252,E402,E722,E741,W503,W504,W605" searx tests test.unit: pyenvinstall diff --git a/searx/engines/digg.py b/searx/engines/digg.py index cb007d40d..63bce51cf 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -9,6 +9,7 @@ @stable no (HTML can change) @parse url, title, content, publishedDate, thumbnail """ +# pylint: disable=missing-function-docstring import random import string From bef185723affdc549487e50ae521db61110c3383 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 2 Dec 2020 21:54:27 +0100 Subject: [PATCH 3/3] [refactor] digg - improve results and clean up source code - strip HTML tags and superfluous quotation marks from content - remove the unneeded cookie from the request - remove superfluous imports Signed-off-by: Markus Heiser --- searx/engines/digg.py | 67 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 63bce51cf..85f727f0d 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -1,7 +1,7 @@ """ Digg (News, Social media) - @website https://digg.com/ + @website https://digg.com @provide-api no @using-api no @@ -11,59 +11,56 @@ """ # pylint: disable=missing-function-docstring -import random -import string from json import loads from urllib.parse import urlencode from datetime import datetime +from lxml import html + 
# engine dependent config categories = ['news', 'social media'] paging = True +base_url = 'https://digg.com' # search-url -base_url = 'https://digg.com/' -search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' +search_url = base_url + ( + '/api/search/' + '?{query}' + '&from={position}' + '&size=20' + '&format=html' +) -# specific xpath variables -results_xpath = '//article' -link_xpath = './/small[@class="time"]//a' -title_xpath = './/h2//a//text()' -content_xpath = './/p//text()' -pubdate_xpath = './/time' - -digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\ - string.digits + "+_" - - -# do search-request def request(query, params): offset = (params['pageno'] - 1) * 20 - params['url'] = search_url.format(position=offset, - query=urlencode({'q': query})) - params['cookies']['frontend.auid'] = ''.join(random.choice( - digg_cookie_chars) for _ in range(22)) + params['url'] = search_url.format( + query = urlencode({'q': query}), + position = offset, + ) return params - -# get response from search-request def response(resp): results = [] - search_result = loads(resp.text) - # parse results - for result in search_result['mapped']: + for result in loads(resp.text)['mapped']: + + # strip html tags and superfluous quotation marks from content + content = html.document_fromstring( + result['excerpt'] + ).text_content() # 'created': {'ISO': '2020-10-16T14:09:55Z', ...} - published = datetime.strptime(result['created']['ISO'], "%Y-%m-%dT%H:%M:%SZ") - # append result - results.append({'url': result['url'], - 'title': result['title'], - 'content': result['excerpt'], - 'template': 'videos.html', - 'publishedDate': published, - 'thumbnail': result['images']['thumbImage']}) + published = datetime.strptime( + result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ' + ) + results.append({ + 'url': result['url'], + 'title': result['title'], + 'content' : content, + 'template': 'videos.html', + 'publishedDate': published, + 'thumbnail': 
result['images']['thumbImage'], + }) - # return results return results