Merge pull request #2285 from return42/fix-digg

bugfix & refactor digg engine
This commit is contained in:
Alexandre Flament 2020-12-03 10:20:40 +01:00 committed by GitHub
commit 6b5a578822
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 43 deletions

View file

@@ -212,15 +212,15 @@ gecko.driver:
PHONY += test test.sh test.pylint test.pep8 test.unit test.coverage test.robot PHONY += test test.sh test.pylint test.pep8 test.unit test.coverage test.robot
test: buildenv test.pylint test.pep8 test.unit gecko.driver test.robot test: buildenv test.pylint test.pep8 test.unit gecko.driver test.robot
PYLINT_FILES=\
# TODO: balance linting with pylint
test.pylint: pyenvinstall
$(call cmd,pylint,\
searx/preferences.py \ searx/preferences.py \
searx/testing.py \ searx/testing.py \
searx/engines/gigablast.py \ searx/engines/gigablast.py \
searx/engines/deviantart.py \ searx/engines/deviantart.py \
) searx/engines/digg.py
test.pylint: pyenvinstall
$(call cmd,pylint,$(PYLINT_FILES))
$(call cmd,pylint,\ $(call cmd,pylint,\
--disable=$(PYLINT_SEARX_DISABLE_OPTION) \ --disable=$(PYLINT_SEARX_DISABLE_OPTION) \
--additional-builtins=$(PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES) \ --additional-builtins=$(PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES) \
@@ -249,7 +249,7 @@ test.sh:
test.pep8: pyenvinstall test.pep8: pyenvinstall
@echo "TEST pycodestyle (formerly pep8)" @echo "TEST pycodestyle (formerly pep8)"
$(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py, searx/engines/deviantart.py' \ $(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, $(foreach f,$(PYLINT_FILES),$(f),)' \
--max-line-length=120 --ignore "E117,E252,E402,E722,E741,W503,W504,W605" searx tests --max-line-length=120 --ignore "E117,E252,E402,E722,E741,W503,W504,W605" searx tests
test.unit: pyenvinstall test.unit: pyenvinstall

View file

@@ -1,7 +1,7 @@
""" """
Digg (News, Social media) Digg (News, Social media)
@website https://digg.com/ @website https://digg.com
@provide-api no @provide-api no
@using-api no @using-api no
@@ -9,59 +9,58 @@
@stable no (HTML can change) @stable no (HTML can change)
@parse url, title, content, publishedDate, thumbnail @parse url, title, content, publishedDate, thumbnail
""" """
# pylint: disable=missing-function-docstring
import random
import string
from json import loads from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from lxml import html
# engine dependent config # engine dependent config
categories = ['news', 'social media'] categories = ['news', 'social media']
paging = True paging = True
base_url = 'https://digg.com'
# search-url # search-url
base_url = 'https://digg.com/' search_url = base_url + (
search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' '/api/search/'
'?{query}'
'&from={position}'
'&size=20'
'&format=html'
)
# specific xpath variables
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
string.digits + "+_"
# do search-request
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * 20 offset = (params['pageno'] - 1) * 20
params['url'] = search_url.format(position=offset, params['url'] = search_url.format(
query=urlencode({'q': query})) query = urlencode({'q': query}),
params['cookies']['frontend.auid'] = ''.join(random.choice( position = offset,
digg_cookie_chars) for _ in range(22)) )
return params return params
# get response from search-request
def response(resp): def response(resp):
results = [] results = []
search_result = loads(resp.text)
# parse results # parse results
for result in search_result['mapped']: for result in loads(resp.text)['mapped']:
published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S") # strip html tags and superfluous quotation marks from content
# append result content = html.document_fromstring(
results.append({'url': result['url'], result['excerpt']
).text_content()
# 'created': {'ISO': '2020-10-16T14:09:55Z', ...}
published = datetime.strptime(
result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ'
)
results.append({
'url': result['url'],
'title': result['title'], 'title': result['title'],
'content': result['excerpt'], 'content' : content,
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': published, 'publishedDate': published,
'thumbnail': result['images']['thumbImage']}) 'thumbnail': result['images']['thumbImage'],
})
# return results
return results return results