[mod] Startpage: reverse engineered & upgraded to data_type: traits_v1

One reason for the often-seen CAPTCHAs on Startpage requests is the
incomplete requests SearXNG sends to startpage.com: this patch is a completely
new implementation of the ``request()`` function, reverse engineered from
Startpage's search form.  The new implementation:

- uses traits of data_type: traits_v1 and drops the deprecated data_type: supported_languages
- adds time-range support
- adds safe-search support
- fixes searxng/searxng/issues 1884
- fixes searxng/searxng/issues 1081 --> improvements to avoid CAPTCHAs

In preparation for more categories (News, Images, Videos, ...) from Startpage,
the variable ``startpage_categ`` was set up.  The default value is ``web``;
other Startpage categories are not yet implemented.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-10-30 11:23:20 +01:00
parent 858aa3e604
commit e9afc4f8ce
4 changed files with 218 additions and 410 deletions

View file

@ -10,9 +10,4 @@ Startpage engines
:backlinks: entry :backlinks: entry
.. automodule:: searx.engines.startpage .. automodule:: searx.engines.startpage
:members:
Functions
=========
.. autofunction:: searx.engines.startpage.fetch_traits
.. autofunction:: searx.engines.startpage.get_sc_code

View file

@ -109,9 +109,9 @@ def seznam(query, _lang):
] ]
def startpage(query, lang): def startpage(query, sxng_locale):
# startpage autocompleter """Autocomplete from Startpage. Supports Startpage's languages"""
lui = engines['startpage'].supported_languages.get(lang, 'english') # vintage / deprecated lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
url = 'https://startpage.com/suggestions?{query}' url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui}))) resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
data = resp.json() data = resp.json()

View file

@ -3078,7 +3078,7 @@
"startpage": { "startpage": {
"all_locale": null, "all_locale": null,
"custom": {}, "custom": {},
"data_type": "supported_languages", "data_type": "traits_v1",
"languages": { "languages": {
"af": "afrikaans", "af": "afrikaans",
"am": "amharic", "am": "amharic",
@ -3213,257 +3213,7 @@
"zh-HK": "zh-TW_HK", "zh-HK": "zh-TW_HK",
"zh-TW": "zh-TW_TW" "zh-TW": "zh-TW_TW"
}, },
"supported_languages": { "supported_languages": {}
"af": {
"alias": "afrikaans"
},
"am": {
"alias": "amharic"
},
"ar": {
"alias": "arabic"
},
"az": {
"alias": "azerbaijani"
},
"be": {
"alias": "belarusian"
},
"bg": {
"alias": "bulgarian"
},
"bn": {
"alias": "bengali"
},
"bs": {
"alias": "bosnian"
},
"ca": {
"alias": "catalan"
},
"cs": {
"alias": "czech"
},
"cy": {
"alias": "welsh"
},
"da": {
"alias": "dansk"
},
"de": {
"alias": "deutsch"
},
"el": {
"alias": "greek"
},
"en": {
"alias": "english"
},
"en-GB": {
"alias": "english_uk"
},
"eo": {
"alias": "esperanto"
},
"es": {
"alias": "espanol"
},
"et": {
"alias": "estonian"
},
"eu": {
"alias": "basque"
},
"fa": {
"alias": "persian"
},
"fi": {
"alias": "suomi"
},
"fo": {
"alias": "faroese"
},
"fr": {
"alias": "francais"
},
"fy": {
"alias": "frisian"
},
"ga": {
"alias": "irish"
},
"gd": {
"alias": "gaelic"
},
"gl": {
"alias": "galician"
},
"gu": {
"alias": "gujarati"
},
"he": {
"alias": "hebrew"
},
"hi": {
"alias": "hindi"
},
"hr": {
"alias": "croatian"
},
"hu": {
"alias": "hungarian"
},
"ia": {
"alias": "interlingua"
},
"id": {
"alias": "indonesian"
},
"is": {
"alias": "icelandic"
},
"it": {
"alias": "italiano"
},
"ja": {
"alias": "nihongo"
},
"jv": {
"alias": "javanese"
},
"ka": {
"alias": "georgian"
},
"kn": {
"alias": "kannada"
},
"ko": {
"alias": "hangul"
},
"la": {
"alias": "latin"
},
"lt": {
"alias": "lithuanian"
},
"lv": {
"alias": "latvian"
},
"mai": {
"alias": "bihari"
},
"mk": {
"alias": "macedonian"
},
"ml": {
"alias": "malayalam"
},
"mr": {
"alias": "marathi"
},
"ms": {
"alias": "malay"
},
"mt": {
"alias": "maltese"
},
"ne": {
"alias": "nepali"
},
"nl": {
"alias": "nederlands"
},
"no": {
"alias": "norsk"
},
"oc": {
"alias": "occitan"
},
"pa": {
"alias": "punjabi"
},
"pl": {
"alias": "polski"
},
"pt": {
"alias": "portugues"
},
"ro": {
"alias": "romanian"
},
"ru": {
"alias": "russian"
},
"si": {
"alias": "sinhalese"
},
"sk": {
"alias": "slovak"
},
"sl": {
"alias": "slovenian"
},
"sq": {
"alias": "albanian"
},
"sr": {
"alias": "serbian"
},
"su": {
"alias": "sudanese"
},
"sv": {
"alias": "svenska"
},
"sw": {
"alias": "swahili"
},
"ta": {
"alias": "tamil"
},
"te": {
"alias": "telugu"
},
"th": {
"alias": "thai"
},
"ti": {
"alias": "tigrinya"
},
"tl": {
"alias": "tagalog"
},
"tr": {
"alias": "turkce"
},
"uk": {
"alias": "ukrainian"
},
"ur": {
"alias": "urdu"
},
"uz": {
"alias": "uzbek"
},
"vi": {
"alias": "vietnamese"
},
"xh": {
"alias": "xhosa"
},
"zh": {
"alias": "jiantizhongwen"
},
"zh-HK": {
"alias": "fantizhengwen"
},
"zh-TW": {
"alias": "fantizhengwen"
},
"zu": {
"alias": "zulu"
}
}
}, },
"wikidata": { "wikidata": {
"all_locale": null, "all_locale": null,

View file

@ -50,38 +50,58 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages Startpage languages
=================== ===================
The displayed name in Startpage's settings page depend on the location of the IP :py:obj:`send_accept_language_header`:
when the 'Accept-Language' HTTP header is unset (in the language update script The displayed name in Startpage's settings page depend on the location of the
we use "en-US,en;q=0.5" to get uniform names independent from the IP). IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
we use::
Each option has a displayed name and a value, either of which may represent the 'Accept-Language': "en-US,en;q=0.5",
language name in the native script, the language name in English, an English ..
transliteration of the native name, the English name of the writing script used
by the language, or occasionally something else entirely. to get uniform names independent from the IP).
.. _startpage categories:
Startpage categories
====================
Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::
- name: startpage
engine: startpage
startpage_categ: web
...
.. hint::
The default category is ``web`` .. and other categories than ``web`` are not
yet implemented.
""" """
from typing import TYPE_CHECKING
from collections import OrderedDict
import re import re
from time import time
from urllib.parse import urlencode
from unicodedata import normalize, combining from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dateutil import parser import dateutil.parser
from lxml import html import lxml.html
from babel import Locale import babel
from babel.localedata import locale_identifiers
from searx import network from searx import network
from searx.utils import extract_text, eval_xpath, match_language from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import ( from searx.exceptions import SearxEngineCaptchaException
SearxEngineResponseException, from searx.locales import region_tag
SearxEngineCaptchaException,
)
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits traits: EngineTraits
# about # about
@ -94,18 +114,28 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""
send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``. Optional the user can select a search-language (can be
different to the UI language) and a region filter.
"""
# engine dependent config # engine dependent config
categories = ['general', 'web'] categories = ['general', 'web']
# there is a mechanism to block "bot" search
# (probably the parameter qid), require
# storing of qid's between mulitble search-calls
paging = True paging = True
supported_languages_url = 'https://www.startpage.com/do/settings' time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}
# search-url # search-url
base_url = 'https://startpage.com/' base_url = 'https://www.startpage.com'
search_url = base_url + 'sp/search?' search_url = base_url + '/sp/search'
# specific xpath variables # specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@ -113,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]' results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]' link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]' content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form
.. code: html
<form action="/sp/search" method="post">
<input type="text" name="query" value="" ..>
<input type="hidden" name="t" value="device">
<input type="hidden" name="lui" value="english">
<input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
<input type="hidden" name="cat" value="web">
<input type="hidden" class="abp" id="abp-input" name="abp" value="1">
</form>
"""
# timestamp of the last fetch of 'sc' code # timestamp of the last fetch of 'sc' code
sc_code_ts = 0 sc_code_ts = 0
sc_code = '' sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
def raise_captcha(resp): def get_sc_code(searxng_locale, params):
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
raise SearxEngineCaptchaException() <search_form_xpath>`. Without this argument Startpage considers the request
is from a bot. We do not know what is encoded in the value of the ``sc``
argument, but it seems to be a kind of a *time-stamp*.
Startpage's search form generates a new sc-code on each request. This
def get_sc_code(headers): function scrap a new sc-code from Startpage's home page every
"""Get an actual ``sc`` argument from Startpage's home page. :py:obj:`sc_code_cache_sec` seconds.
Startpage puts a ``sc`` argument on every link. Without this argument
Startpage considers the request is from a bot. We do not know what is
encoded in the value of the ``sc`` argument, but it seems to be a kind of a
*time-stamp*. This *time-stamp* is valid for a few hours.
This function scrap a new *time-stamp* from startpage's home page every hour
(3000 sec).
""" """
global sc_code_ts, sc_code # pylint: disable=global-statement global sc_code_ts, sc_code # pylint: disable=global-statement
if time() > (sc_code_ts + 3000): if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
logger.debug("query new sc time-stamp ...") logger.debug("get_sc_code: reuse '%s'", sc_code)
return sc_code
resp = network.get(base_url, headers=headers) headers = {**params['headers']}
raise_captcha(resp) headers['Origin'] = base_url
dom = html.fromstring(resp.text) headers['Referer'] = base_url + '/'
# headers['Connection'] = 'keep-alive'
# headers['Accept-Encoding'] = 'gzip, deflate, br'
# headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
# headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
# add Accept-Language header
if searxng_locale == 'all':
searxng_locale = 'en-US'
locale = babel.Locale.parse(searxng_locale, sep='-')
if send_accept_language_header:
ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
locale.language,
locale.territory,
locale.language,
)
headers['Accept-Language'] = ac_lang
get_sc_url = base_url + '/?sc=%s' % (sc_code)
logger.debug("query new sc time-stamp ... %s", get_sc_url)
logger.debug("headers: %s", headers)
resp = network.get(get_sc_url, headers=headers)
# ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
)
dom = lxml.html.fromstring(resp.text)
try: try:
# <input type="hidden" name="sc" value="..."> sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
except IndexError as exc: except IndexError as exc:
# suspend startpage API --> https://github.com/searxng/searxng/pull/695 logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineResponseException( raise SearxEngineCaptchaException(
suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!" message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
) from exc ) from exc
sc_code_ts = time() sc_code_ts = time()
logger.debug("new value is: %s", sc_code) logger.debug("get_sc_code: new value is: %s", sc_code)
return sc_code return sc_code
# do search-request
def request(query, params): def request(query, params):
"""Assemble a Startpage request.
# pylint: disable=line-too-long To avoid CAPTCHA we need to send a well formed HTTP POST request with a
# The format string from Startpage's FFox add-on [1]:: cookie. We need to form a request that is identical to the request build by
# Startpage's search form:
# https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0
#
# [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/
- in the cookie the **region** is selected
- in the HTTP POST data the **language** is selected
Additionally the arguments form Startpage's search form needs to be set in
HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
"""
if startpage_categ == 'web':
return _request_cat_web(query, params)
logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
return params
def _request_cat_web(query, params):
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# build arguments
args = { args = {
'query': query, 'query': query,
'page': params['pageno'],
'cat': 'web', 'cat': 'web',
# 'pl': 'ext-ff', 't': 'device',
# 'extVersion': '1.3.0', 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
# 'abp': "-1", 'with_date': time_range_dict.get(params['time_range'], ''),
'sc': get_sc_code(params['headers']),
} }
# set language if specified if engine_language:
if params['language'] != 'all': args['language'] = engine_language
lang_code = match_language(params['language'], supported_languages, fallback=None) args['lui'] = engine_language
if lang_code:
language_name = supported_languages[lang_code]['alias'] args['abp'] = '1'
args['language'] = language_name if params['pageno'] > 1:
args['lui'] = language_name args['page'] = params['pageno']
# build cookie
lang_homepage = 'en'
cookie = OrderedDict()
cookie['date_time'] = 'world'
cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
cookie['disable_open_in_new_window'] = '0'
cookie['enable_post_method'] = '1' # hint: POST
cookie['enable_proxy_safety_suggest'] = '1'
cookie['enable_stay_control'] = '1'
cookie['instant_answers'] = '1'
cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
cookie['num_of_results'] = '10'
cookie['suggestions'] = '1'
cookie['wt_unit'] = 'celsius'
if engine_language:
cookie['language'] = engine_language
cookie['language_ui'] = engine_language
if engine_region:
cookie['search_results_region'] = engine_region
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences'])
# POST request
logger.debug("data: %s", args)
params['data'] = args
params['method'] = 'POST'
params['url'] = search_url
params['headers']['Origin'] = base_url
params['headers']['Referer'] = base_url + '/'
# is the Accept header needed?
# params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['url'] = search_url + urlencode(args)
return params return params
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = [] dom = lxml.html.fromstring(resp.text)
dom = html.fromstring(resp.text) if startpage_categ == 'web':
return _response_cat_web(dom)
logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
return []
def _response_cat_web(dom):
results = []
# parse results # parse results
for result in eval_xpath(dom, results_xpath): for result in eval_xpath(dom, results_xpath):
@ -233,7 +364,7 @@ def response(resp):
content = content[date_pos:] content = content[date_pos:]
try: try:
published_date = parser.parse(date_string, dayfirst=True) published_date = dateutil.parser.parse(date_string, dayfirst=True)
except ValueError: except ValueError:
pass pass
@ -259,78 +390,10 @@ def response(resp):
return results return results
# get supported languages from their site
def _fetch_supported_languages(resp):
# startpage's language selector is a mess each option has a displayed name
# and a value, either of which may represent the language name in the native
# script, the language name in English, an English transliteration of the
# native name, the English name of the writing script used by the language,
# or occasionally something else entirely.
# this cases are so special they need to be hardcoded, a couple of them are misspellings
language_names = {
'english_uk': 'en-GB',
'fantizhengwen': ['zh-TW', 'zh-HK'],
'hangul': 'ko',
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
'sudanese': 'su',
}
# get the English name of every language known by babel
language_names.update(
{
# fmt: off
name.lower(): lang_code
# pylint: disable=protected-access
for lang_code, name in Locale('en')._data['languages'].items()
# fmt: on
}
)
# get the native name of every language known by babel
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
native_name = Locale(lang_code).get_language_name().lower()
# add native name exactly as it is
language_names[native_name] = lang_code
# add "normalized" language name (i.e. français becomes francais and español becomes espanol)
unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
if len(unaccented_name) == len(unaccented_name.encode()):
# add only if result is ascii (otherwise "normalization" didn't work)
language_names[unaccented_name] = lang_code
dom = html.fromstring(resp.text)
sp_lang_names = []
for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
sp_lang_names.append((option.get('value'), extract_text(option).lower()))
supported_languages = {}
for sp_option_value, sp_option_text in sp_lang_names:
lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
if isinstance(lang_code, str):
supported_languages[lang_code] = {'alias': sp_option_value}
elif isinstance(lang_code, list):
for _lc in lang_code:
supported_languages[_lc] = {'alias': sp_option_value}
else:
print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))
return supported_languages
def fetch_traits(engine_traits: EngineTraits): def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
regions>` from Startpage.""" regions>` from Startpage."""
# pylint: disable=import-outside-toplevel, too-many-locals, too-many-branches # pylint: disable=too-many-branches
# pylint: disable=too-many-statements
engine_traits.data_type = 'supported_languages' # deprecated
import babel
from searx.utils import gen_useragent
from searx.locales import region_tag
headers = { headers = {
'User-Agent': gen_useragent(), 'User-Agent': gen_useragent(),
@ -341,7 +404,7 @@ def fetch_traits(engine_traits: EngineTraits):
if not resp.ok: if not resp.ok:
print("ERROR: response from Startpage is not OK.") print("ERROR: response from Startpage is not OK.")
dom = html.fromstring(resp.text) dom = lxml.html.fromstring(resp.text)
# regions # regions