Merge pull request #91 from return42/xpath-misc

[doc] add documentation about the XPath engine
commit 703f8c4a8b
Markus Heiser 2021-05-23 10:02:16 +00:00, committed by GitHub
5 changed files with 112 additions and 29 deletions

View file

@@ -43,7 +43,7 @@ argument type information
 categories              list        pages, in which the engine is working
 paging                  boolean     support multiple pages
 time_range_support      boolean     support search time range
 engine_type             str         ``online`` by default, other possible values are
                                     ``offline``, ``online_dictionnary``, ``online_currency``
 ======================= =========== ========================================================
@@ -100,6 +100,8 @@ example code
     paging = True

+.. _engine request:
+
 making a request
 ================
@@ -198,6 +200,8 @@ example code
     return params

+.. _engine results:
+
 returned results
 ================
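
For context, the attributes in the table above are plain module-level variables of an engine, and the two anchors added here (``engine request``, ``engine results``) point at the two hooks every engine implements. A minimal sketch of such a module, assuming a made-up JSON API; the site, ``base_url`` and all field names are illustrative, not part of this commit:

.. code:: python

  # Minimal sketch of an ``online`` engine module.  The attribute names come
  # from the table above; the site, URL and JSON field names are invented.
  from urllib.parse import urlencode

  categories = ['general']     # pages, in which the engine is working
  paging = True                # the engine supports multiple pages
  time_range_support = False   # no search time range support
  # engine_type stays at its default, ``online``

  base_url = 'https://example.org/search'   # illustrative only

  def request(query, params):
      # "making a request": fill in the URL the engine should fetch
      args = {'q': query, 'page': params['pageno']}
      params['url'] = base_url + '?' + urlencode(args)
      return params

  def response(resp):
      # "returned results": map the reply onto searx result dicts
      results = []
      for item in resp.json().get('items', []):   # assumes a JSON reply, for brevity
          results.append({
              'url': item['url'],
              'title': item['title'],
              'content': item['summary'],
          })
      return results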

View file

@@ -9,6 +9,7 @@ Developer documentation
    quickstart
    contribution_guide
    engine_overview
+   xpath_engine
    search_api
    plugins
    translation

View file

@@ -0,0 +1,9 @@
+.. _xpath_engine:
+
+================
+The XPath engine
+================
+
+.. automodule:: searx.engines.xpath
+  :members:

View file

@@ -4,7 +4,8 @@ Welcome to searxng
 *Search without being tracked.*

-.. warning::
+.. hint::

    This is not searx, but searxng.

 Searxng is a free internet metasearch engine which aggregates results from more

View file

@@ -1,51 +1,106 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-function-docstring
+"""The XPath engine is a *generic* engine with which it is possible to configure
+engines in the settings.
+
+Here is a simple example of an XPath engine configured in the
+:ref:`settings engine` section, further read :ref:`engines-dev`.
+
+.. code:: yaml
+
+  - name : bitbucket
+    engine : xpath
+    paging : True
+    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
+    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
+    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
+    content_xpath : //article[@class="repo-summary"]/p
+
+"""
+
+from urllib.parse import urlencode
+
 from lxml import html
-from urllib.parse import urlencode
 from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
+from searx import logger
+
+logger = logger.getChild('XPath engine')

 search_url = None
-url_xpath = None
-content_xpath = None
-title_xpath = None
-thumbnail_xpath = False
-paging = False
-suggestion_xpath = ''
+"""
+Search URL of the engine; replacements are:
+
+``{query}``:
+  Search terms from user.
+
+``{pageno}``:
+  Page number if engine supports paging :py:obj:`paging`
+"""
+
+soft_max_redirects = 0
+'''Maximum redirects, soft limit.  Record an error but don't stop the engine.'''

 results_xpath = ''
+'''XPath selector for the list of result items.'''
+
+url_xpath = None
+'''XPath selector of result's ``url``.'''
+
+content_xpath = None
+'''XPath selector of result's ``content``.'''
+
+title_xpath = None
+'''XPath selector of result's ``title``.'''
+
+thumbnail_xpath = False
+'''XPath selector of result's ``img_src``.'''
+
+suggestion_xpath = ''
+'''XPath selector of result's ``suggestion``.'''
+
 cached_xpath = ''
 cached_url = ''
-soft_max_redirects = 0

-# parameters for engines with paging support
-#
-# number of results on each page
-# (only needed if the site requires not a page number, but an offset)
+paging = False
+'''Engine supports paging [True or False].'''
+
 page_size = 1
-# number of the first page (usually 0 or 1)
-first_page_num = 1
+'''Number of results on each page.  Only needed if the site requires not a page
+number, but an offset.'''
+
+first_page_num = 1
+'''Number of the first page (usually 0 or 1).'''


 def request(query, params):
+    '''Build request parameters (see :ref:`engine request`).'''
     query = urlencode({'q': query})[2:]

-    fp = {'query': query}
+    fargs = {'query': query}
     if paging and search_url.find('{pageno}') >= 0:
-        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
+        fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num

-    params['url'] = search_url.format(**fp)
+    params['url'] = search_url.format(**fargs)
     params['query'] = query
     params['soft_max_redirects'] = soft_max_redirects
+
+    logger.debug("query_url --> %s", params['url'])
     return params


 def response(resp):
+    '''Scrape *results* from the response (see :ref:`engine results`).'''
     results = []
     dom = html.fromstring(resp.text)
-    is_onion = True if 'onions' in categories else False  # pylint: disable=undefined-variable
+    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

     if results_xpath:
         for result in eval_xpath_list(dom, results_xpath):
             url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
             title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
             content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
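
The paging arithmetic in ``request()`` above deserves a worked example. A small sketch, assuming a site that expects a result offset rather than a page number; the values of ``page_size`` and ``first_page_num`` are illustrative:

.. code:: python

  # The expression from request() above, for a hypothetical site that counts
  # result offsets from 0 and shows 20 results per page (values illustrative).
  page_size = 20
  first_page_num = 0

  def pageno_to_offset(pageno):
      # same formula as in request()
      return (pageno - 1) * page_size + first_page_num

  assert pageno_to_offset(1) == 0    # page 1 -> offset 0
  assert pageno_to_offset(2) == 20   # page 2 -> offset 20
  # with the defaults page_size=1, first_page_num=1 the formula degenerates
  # to the page number itself: (pageno - 1) * 1 + 1 == pageno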
@@ -59,13 +114,16 @@ def response(resp):
             # add alternative cached url if available
             if cached_xpath:
-                tmp_result['cached_url'] = cached_url\
+                tmp_result['cached_url'] = (
+                    cached_url
                     + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
+                )

             if is_onion:
                 tmp_result['is_onion'] = True

             results.append(tmp_result)
+
     else:
         if cached_xpath:
             for url, title, content, cached in zip(
@@ -75,8 +133,12 @@ def response(resp):
                 map(extract_text, eval_xpath_list(dom, content_xpath)),
                 map(extract_text, eval_xpath_list(dom, cached_xpath))
             ):
-                results.append({'url': url, 'title': title, 'content': content,
-                                'cached_url': cached_url + cached, 'is_onion': is_onion})
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'content': content,
+                    'cached_url': cached_url + cached,
+                    'is_onion': is_onion
+                })
         else:
             for url, title, content in zip(
                 (extract_url(x, search_url) for
@@ -84,10 +146,16 @@ def response(resp):
                 map(extract_text, eval_xpath_list(dom, title_xpath)),
                 map(extract_text, eval_xpath_list(dom, content_xpath))
             ):
-                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
+                results.append({
+                    'url': url,
+                    'title': title,
+                    'content': content,
+                    'is_onion': is_onion
+                })

-    if not suggestion_xpath:
-        return results
-    for suggestion in eval_xpath(dom, suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
+    if suggestion_xpath:
+        for suggestion in eval_xpath(dom, suggestion_xpath):
+            results.append({'suggestion': extract_text(suggestion)})
+
+    logger.debug("found %s results", len(results))
     return results
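
To see what ``response()`` does when ``results_xpath`` is set, here is a self-contained sketch of the same extraction against bitbucket-style markup as in the docstring's YAML example. The HTML snippet is invented, the selectors are relative variants of the configured ones, and plain lxml calls stand in for the ``searx.utils`` helpers:

.. code:: python

  # Self-contained sketch of the extraction done in response() when
  # results_xpath is set.  The HTML is invented; extract_text / extract_url
  # from searx.utils are approximated with plain lxml calls.
  from lxml import html

  doc = html.fromstring("""
  <article class="repo-summary">
    <a class="repo-link" href="/repo/demo">demo repository</a>
    <p>A short description.</p>
  </article>
  """)

  results_xpath = '//article[@class="repo-summary"]'
  url_xpath = './/a[@class="repo-link"]/@href'
  title_xpath = './/a[@class="repo-link"]'
  content_xpath = './p'

  results = []
  for result in doc.xpath(results_xpath):
      results.append({
          'url': result.xpath(url_xpath)[0],                     # -> '/repo/demo'
          'title': result.xpath(title_xpath)[0].text_content(),  # -> 'demo repository'
          'content': result.xpath(content_xpath)[0].text_content(),
      })

  print(results)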