Merge branch 'master' into feature/accessibility

Commit a51b2b6c20 by Mathieu Brunot, 2019-10-16 19:30:02 +02:00 (committed by GitHub)
35 changed files with 373 additions and 473 deletions


@@ -11,7 +11,9 @@ ARG TIMESTAMP_UWSGI=0
 ARG LABEL_VCS_REF=
 ARG LABEL_VCS_URL=
-ENV BASE_URL= \
+ENV INSTANCE_NAME=searx \
+    AUTOCOMPLETE= \
+    BASE_URL= \
     MORTY_KEY= \
     MORTY_URL=
 EXPOSE 8080


@@ -29,6 +29,8 @@ do
     printf " -f Always update on the configuration files (existing files are renamed with the .old suffix)\n"
     printf " Without this option, new configuration files are copied with the .new suffix\n"
     printf "\nEnvironment variables:\n\n"
+    printf " INSTANCE_NAME settings.yml : general.instance_name\n"
+    printf " AUTOCOMPLETE  settings.yml : search.autocomplete\n"
     printf " BASE_URL      settings.yml : server.base_url\n"
     printf " MORTY_URL     settings.yml : result_proxy.url\n"
     printf " MORTY_KEY     settings.yml : result_proxy.key\n"
@@ -53,6 +55,8 @@ patch_searx_settings() {
     # update settings.yml
     sed -i -e "s|base_url : False|base_url : ${BASE_URL}|g" \
+           -e "s/instance_name : \"searx\"/instance_name : \"${INSTANCE_NAME}\"/g" \
+           -e "s/autocomplete : \"\"/autocomplete : \"${AUTOCOMPLETE}\"/g" \
            -e "s/ultrasecretkey/$(openssl rand -hex 32)/g" \
            "${CONF}"


@@ -27,7 +27,7 @@ from json import loads
 from requests import get
 from searx import settings
 from searx import logger
-from searx.utils import load_module, match_language
+from searx.utils import load_module, match_language, get_engine_from_settings

 logger = logger.getChild('engines')
@@ -53,7 +53,8 @@ engine_default_args = {'paging': False,
                        'disabled': False,
                        'suspend_end_time': 0,
                        'continuous_errors': 0,
-                       'time_range_support': False}
+                       'time_range_support': False,
+                       'offline': False}

 def load_engine(engine_data):
@@ -128,14 +129,16 @@ def load_engine(engine_data):
     engine.stats = {
         'result_count': 0,
         'search_count': 0,
-        'page_load_time': 0,
-        'page_load_count': 0,
         'engine_time': 0,
         'engine_time_count': 0,
         'score_count': 0,
         'errors': 0
     }

+    if not engine.offline:
+        engine.stats['page_load_time'] = 0
+        engine.stats['page_load_count'] = 0
+
     for category_name in engine.categories:
         categories.setdefault(category_name, []).append(engine)
@@ -173,11 +176,6 @@ def get_engines_stats():
         results_num = \
             engine.stats['result_count'] / float(engine.stats['search_count'])

-        if engine.stats['page_load_count'] != 0:
-            load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
-        else:
-            load_times = 0
         if engine.stats['engine_time_count'] != 0:
             this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])  # noqa
         else:
@@ -189,14 +187,19 @@ def get_engines_stats():
         else:
             score = score_per_result = 0.0

-        max_pageload = max(load_times, max_pageload)
+        if not engine.offline:
+            load_times = 0
+            if engine.stats['page_load_count'] != 0:
+                load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
+            max_pageload = max(load_times, max_pageload)
+            pageloads.append({'avg': load_times, 'name': engine.name})
+
         max_engine_times = max(this_engine_time, max_engine_times)
         max_results = max(results_num, max_results)
         max_score = max(score, max_score)
         max_score_per_result = max(score_per_result, max_score_per_result)
         max_errors = max(max_errors, engine.stats['errors'])

-        pageloads.append({'avg': load_times, 'name': engine.name})
         engine_times.append({'avg': this_engine_time, 'name': engine.name})
         results.append({'avg': results_num, 'name': engine.name})
         scores.append({'avg': score, 'name': engine.name})
@@ -255,7 +258,7 @@ def initialize_engines(engine_list):
     load_engines(engine_list)

     def engine_init(engine_name, init_fn):
-        init_fn()
+        init_fn(get_engine_from_settings(engine_name))
         logger.debug('%s engine: Initialized', engine_name)

     for engine_name, engine in engines.items():
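
Note: a minimal sketch of what an engine module could look like with the new offline flag. The module name, the sample data and the result keys below are made up; only the offline attribute, the init(engine_settings) hook and the search(query, params) entry point come from the changes above.

    # hypothetical offline engine module, e.g. searx/engines/example_offline.py
    categories = ['general']
    offline = True      # skip the HTTP request/response machinery in search.py
    paging = False

    _documents = {
        'searx': 'a privacy-respecting, hackable metasearch engine',
    }


    def init(engine_settings=None):
        # engine_settings is this engine's entry from settings.yml, now passed
        # in by initialize_engines() through get_engine_from_settings()
        pass


    def search(query, params):
        # called by search_one_offline_request(); params carries 'category' and 'pageno'
        results = []
        for name, description in _documents.items():
            if query.lower() in name:
                # no 'url' key: rendered with the new key-value result templates
                results.append({'name': name,
                                'description': description,
                                'template': 'key-value.html'})
        return results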


@@ -17,6 +17,7 @@ from searx.url_utils import urlencode

 categories = ['science']
+paging = True

 base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
            + '{query}&start={offset}&max_results={number_of_results}'


@@ -24,7 +24,7 @@ time_range_support = True

 # search-url
 base_url = 'https://www.deviantart.com/'
-search_url = base_url + 'browse/all/?offset={offset}&{query}'
+search_url = base_url + 'search?page={page}&{query}'
 time_range_url = '&order={range}'

 time_range_dict = {'day': 11,
@@ -37,9 +37,7 @@ def request(query, params):
     if params['time_range'] and params['time_range'] not in time_range_dict:
         return params

-    offset = (params['pageno'] - 1) * 24
-
-    params['url'] = search_url.format(offset=offset,
+    params['url'] = search_url.format(page=params['pageno'],
                                       query=urlencode({'q': query}))

     if params['time_range'] in time_range_dict:
         params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
@@ -57,28 +55,27 @@ def response(resp):
     dom = html.fromstring(resp.text)

-    regex = re.compile(r'\/200H\/')
-
     # parse results
-    for result in dom.xpath('.//span[@class="thumb wide"]'):
-        link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0]
-        url = link.attrib.get('href')
-        title = extract_text(result.xpath('.//span[@class="title"]'))
-        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
-        img_src = regex.sub('/', thumbnail_src)
+    for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
+        for result in row.xpath('./div'):
+            link = result.xpath('.//a[@data-hook="deviation_link"]')[0]
+            url = link.attrib.get('href')
+            title = link.attrib.get('title')
+            thumbnail_src = result.xpath('.//img')[0].attrib.get('src')
+            img_src = thumbnail_src

             # http to https, remove domain sharding
             thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
             thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)

             url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

             # append result
             results.append({'url': url,
                             'title': title,
                             'img_src': img_src,
                             'thumbnail_src': thumbnail_src,
                             'template': 'images.html'})

     # return results
     return results


@@ -15,7 +15,8 @@ import string
 from dateutil import parser
 from json import loads
 from lxml import html
-from searx.url_utils import quote_plus
+from searx.url_utils import urlencode
+from datetime import datetime

 # engine dependent config
 categories = ['news', 'social media']
@@ -23,7 +24,7 @@ paging = True

 # search-url
 base_url = 'https://digg.com/'
-search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
+search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'

 # specific xpath variables
 results_xpath = '//article'
@@ -38,9 +39,9 @@ digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\

 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
+    offset = (params['pageno'] - 1) * 20
     params['url'] = search_url.format(position=offset,
-                                      query=quote_plus(query))
+                                      query=urlencode({'q': query}))
     params['cookies']['frontend.auid'] = ''.join(random.choice(
         digg_cookie_chars) for _ in range(22))
     return params
@@ -52,30 +53,17 @@ def response(resp):
     search_result = loads(resp.text)

-    if 'html' not in search_result or search_result['html'] == '':
-        return results
-
-    dom = html.fromstring(search_result['html'])
-
     # parse results
-    for result in dom.xpath(results_xpath):
-        url = result.attrib.get('data-contenturl')
-        thumbnail = result.xpath('.//img')[0].attrib.get('src')
-        title = ''.join(result.xpath(title_xpath))
-        content = ''.join(result.xpath(content_xpath))
-        pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
-        publishedDate = parser.parse(pubdate)
-
-        # http to https
-        thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
+    for result in search_result['mapped']:
+        published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")

         # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
+        results.append({'url': result['url'],
+                        'title': result['title'],
+                        'content': result['excerpt'],
                         'template': 'videos.html',
-                        'publishedDate': publishedDate,
-                        'thumbnail': thumbnail})
+                        'publishedDate': published,
+                        'thumbnail': result['images']['thumbImage']})

     # return results
     return results


@@ -65,21 +65,36 @@ def get_region_code(lang, lang_list=[]):
 def request(query, params):
-    if params['time_range'] and params['time_range'] not in time_range_dict:
+    if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
         return params

     offset = (params['pageno'] - 1) * 30

     region_code = get_region_code(params['language'], supported_languages)
-    if region_code:
-        params['url'] = url.format(
-            query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+    params['url'] = 'https://duckduckgo.com/html/'
+    if params['pageno'] > 1:
+        params['method'] = 'POST'
+        params['data']['q'] = query
+        params['data']['s'] = offset
+        params['data']['dc'] = 30
+        params['data']['nextParams'] = ''
+        params['data']['v'] = 'l'
+        params['data']['o'] = 'json'
+        params['data']['api'] = '/d.js'
+        if params['time_range'] in time_range_dict:
+            params['data']['df'] = time_range_dict[params['time_range']]
+        if region_code:
+            params['data']['kl'] = region_code
     else:
-        params['url'] = url.format(
-            query=urlencode({'q': query}), offset=offset, dc_param=offset)
+        if region_code:
+            params['url'] = url.format(
+                query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
+        else:
+            params['url'] = url.format(
+                query=urlencode({'q': query}), offset=offset, dc_param=offset)

     if params['time_range'] in time_range_dict:
         params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])

     return params
@@ -91,7 +106,9 @@ def response(resp):
     doc = fromstring(resp.text)

     # parse results
-    for r in doc.xpath(result_xpath):
+    for i, r in enumerate(doc.xpath(result_xpath)):
+        if i >= 30:
+            break
         try:
             res_url = r.xpath(url_xpath)[-1]
         except:


@@ -35,8 +35,8 @@ search_string = 'search?{query}'\
     '&ff={safesearch}'\
     '&rxiec={rxieu}'\
     '&ulse={ulse}'\
-    '&rand={rxikd}'  # current unix timestamp
+    '&rand={rxikd}'\
+    '&dbez={dbez}'

 # specific xpath variables
 results_xpath = '//response//result'
 url_xpath = './/url'
@@ -70,7 +70,8 @@ def request(query, params):
                                   rxieu=random.randint(1000000000, 9999999999),
                                   ulse=random.randint(100000000, 999999999),
                                   lang=language,
-                                  safesearch=safesearch)
+                                  safesearch=safesearch,
+                                  dbez=random.randint(100000000, 999999999))

     params['url'] = base_url + search_path


@@ -66,7 +66,7 @@ def get_client_id():
     return ""

-def init():
+def init(engine_settings=None):
     global guest_client_id
     # api-key
     guest_client_id = get_client_id()


@@ -15,6 +15,7 @@ from dateutil import parser
 from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
+from searx.languages import language_codes

 # engine dependent config
 categories = ['general']
@@ -22,7 +23,7 @@ categories = ['general']
 # (probably the parameter qid), require
 # storing of qid's between mulitble search-calls
-# paging = False
+paging = True
 language_support = True

 # search-url
@@ -32,23 +33,32 @@ search_url = base_url + 'do/search'

 # specific xpath variables
 # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
 # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
-link_xpath = './/h3/a'
-content_xpath = './p[@class="search-item__body"]'
+results_xpath = '//div[@class="w-gl__result"]'
+link_xpath = './/a[@class="w-gl__result-title"]'
+content_xpath = './/p[@class="w-gl__description"]'

 # do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10

     params['url'] = search_url
     params['method'] = 'POST'
-    params['data'] = {'query': query,
-                      'startat': offset}
+    params['data'] = {
+        'query': query,
+        'page': params['pageno'],
+        'cat': 'web',
+        'cmd': 'process_search',
+        'engine0': 'v1all',
+    }

     # set language if specified
     if params['language'] != 'all':
-        params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
+        language = 'english'
+        for lc, _, _, lang in language_codes:
+            if lc == params['language']:
+                language = lang
+        params['data']['language'] = language
+        params['data']['lui'] = language

     return params


@@ -55,7 +55,7 @@ def obtain_token():
     return token

-def init():
+def init(engine_settings=None):
     obtain_token()


@@ -11,8 +11,8 @@
 """
 from lxml import html
-import re
 from searx.url_utils import urlencode, urljoin
+from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['images']
@@ -34,41 +34,18 @@ def request(query, params):
 def response(resp):
     results = []

-    # get links from result-text
-    regex = re.compile('(</a>|<a)')
-    results_parts = re.split(regex, resp.text)
-    cur_element = ''
-
-    # iterate over link parts
-    for result_part in results_parts:
-        # processed start and end of link
-        if result_part == '<a':
-            cur_element = result_part
-            continue
-        elif result_part != '</a>':
-            cur_element += result_part
-            continue
-
-        cur_element += result_part
-
-        # fix xml-error
-        cur_element = cur_element.replace('"></a>', '"/></a>')
-
-        dom = html.fromstring(cur_element)
-        link = dom.xpath('//a')[0]
-
+    dom = html.fromstring(resp.text)
+    for res in dom.xpath('//div[@class="List-item MainListing"]'):
+        link = res.xpath('//a')[0]
         url = urljoin(base_url, link.attrib.get('href'))
-        title = link.attrib.get('title', '')
-        thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
+        title = extract_text(link)
+        thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
+
         # TODO: get image with higher resolution
         img_src = thumbnail_src

-        # check if url is showing to a photo
-        if '/photo/' not in url:
-            continue
-
         # append result
         results.append({'url': url,
                         'title': title,


@@ -28,5 +28,6 @@ class SearxParameterException(SearxException):
         else:
             message = 'Invalid value "' + value + '" for parameter ' + name
         super(SearxParameterException, self).__init__(message)
+        self.message = message
         self.parameter_name = name
         self.parameter_value = value


@@ -225,6 +225,9 @@ def https_url_rewrite(result):
 def on_result(request, search, result):
+    if 'parsed_url' not in result:
+        return True
+
     if result['parsed_url'].scheme == 'http':
         https_url_rewrite(result)
     return True


@@ -35,6 +35,9 @@ def get_doi_resolver(args, preference_doi_resolver):
 def on_result(request, search, result):
+    if 'parsed_url' not in result:
+        return True
+
     doi = extract_doi(result['parsed_url'])
     if doi and len(doi) < 50:
         for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):


@@ -17,10 +17,10 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 from flask_babel import gettext
 import re
-from searx.url_utils import urlunparse
+from searx.url_utils import urlunparse, parse_qsl, urlencode

-regexes = {re.compile(r'utm_[^&]+&?'),
-           re.compile(r'(wkey|wemail)[^&]+&?'),
+regexes = {re.compile(r'utm_[^&]+'),
+           re.compile(r'(wkey|wemail)[^&]*'),
            re.compile(r'&$')}

 name = gettext('Tracker URL remover')
@@ -30,16 +30,25 @@ preference_section = 'privacy'

 def on_result(request, search, result):
+    if 'parsed_url' not in result:
+        return True
+
     query = result['parsed_url'].query

     if query == "":
         return True
+    parsed_query = parse_qsl(query)

-    for reg in regexes:
-        query = reg.sub('', query)
+    changed = False
+    for i, (param_name, _) in enumerate(list(parsed_query)):
+        for reg in regexes:
+            if reg.match(param_name):
+                parsed_query.pop(i)
+                changed = True
+                break

-    if query != result['parsed_url'].query:
-        result['parsed_url'] = result['parsed_url']._replace(query=query)
+    if changed:
+        result['parsed_url'] = result['parsed_url']._replace(query=urlencode(parsed_query))
         result['url'] = urlunparse(result['parsed_url'])

     return True
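
Note: a standalone Python 3 sketch of the new behaviour (the sample URL is made up; the plugin itself imports these helpers from searx.url_utils). Instead of regex-substituting over the raw query string, whole parameters are now matched by name with parse_qsl and the remainder is re-encoded.

    import re
    from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

    regexes = {re.compile(r'utm_[^&]+'),
               re.compile(r'(wkey|wemail)[^&]*'),
               re.compile(r'&$')}

    url = 'https://example.com/article?id=42&utm_source=newsletter'
    parsed_url = urlparse(url)

    parsed_query = parse_qsl(parsed_url.query)
    changed = False
    for i, (param_name, _) in enumerate(list(parsed_query)):
        for reg in regexes:
            if reg.match(param_name):
                # drop the whole tracking parameter instead of editing the raw string
                parsed_query.pop(i)
                changed = True
                break

    if changed:
        parsed_url = parsed_url._replace(query=urlencode(parsed_query))

    print(urlunparse(parsed_url))   # -> https://example.com/article?id=42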


@@ -184,7 +184,7 @@ class SearchQuery(object):
         self.lang = lang
         self.safesearch = safesearch
         self.pageno = pageno
-        self.time_range = time_range
+        self.time_range = None if time_range in ('', 'None', None) else time_range
         self.timeout_limit = timeout_limit

     def __str__(self):


@@ -197,6 +197,13 @@ class ResultContainer(object):
         self.infoboxes.append(infobox)

     def _merge_result(self, result, position):
+        if 'url' in result:
+            self.__merge_url_result(result, position)
+            return
+
+        self.__merge_result_no_url(result, position)
+
+    def __merge_url_result(self, result, position):
         result['parsed_url'] = urlparse(result['url'])

         # if the result has no scheme, use http as default
@@ -210,51 +217,60 @@
         if result.get('content'):
             result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

-        # check for duplicates
-        duplicated = False
+        duplicated = self.__find_duplicated_http_result(result)
+        if duplicated:
+            self.__merge_duplicated_http_result(duplicated, result, position)
+            return
+
+        # if there is no duplicate found, append result
+        result['positions'] = [position]
+        with RLock():
+            self._merged_results.append(result)
+
+    def __find_duplicated_http_result(self, result):
         result_template = result.get('template')
         for merged_result in self._merged_results:
+            if 'parsed_url' not in merged_result:
+                continue
             if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
                and result_template == merged_result.get('template'):
                 if result_template != 'images.html':
                     # not an image, same template, same url : it's a duplicate
-                    duplicated = merged_result
-                    break
+                    return merged_result
                 else:
                     # it's an image
                     # it's a duplicate if the parsed_url, template and img_src are differents
                     if result.get('img_src', '') == merged_result.get('img_src', ''):
-                        duplicated = merged_result
-                        break
+                        return merged_result
+        return None

-        # merge duplicates together
-        if duplicated:
-            # using content with more text
-            if result_content_len(result.get('content', '')) >\
-                    result_content_len(duplicated.get('content', '')):
-                duplicated['content'] = result['content']
+    def __merge_duplicated_http_result(self, duplicated, result, position):
+        # using content with more text
+        if result_content_len(result.get('content', '')) >\
+                result_content_len(duplicated.get('content', '')):
+            duplicated['content'] = result['content']

         # merge all result's parameters not found in duplicate
         for key in result.keys():
             if not duplicated.get(key):
                 duplicated[key] = result.get(key)

         # add the new position
         duplicated['positions'].append(position)

         # add engine to list of result-engines
         duplicated['engines'].add(result['engine'])

         # using https if possible
         if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
             duplicated['url'] = result['parsed_url'].geturl()
             duplicated['parsed_url'] = result['parsed_url']

-        # if there is no duplicate found, append result
-        else:
+    def __merge_result_no_url(self, result, position):
+        result['engines'] = set([result['engine']])
         result['positions'] = [position]
         with RLock():
             self._merged_results.append(result)

     def order_results(self):
         for result in self._merged_results:
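
For illustration (the field names other than 'engine' are made up): a result dict that arrives without a 'url' key now bypasses URL parsing and duplicate detection entirely.

    # hypothetical no-'url' result, e.g. produced by an offline engine
    result = {'engine': 'example_offline',
              'name': 'searx',
              'description': 'a privacy-respecting metasearch engine'}

    # _merge_result() routes it to __merge_result_no_url(), which only does:
    #     result['engines'] = set([result['engine']])
    #     result['positions'] = [position]
    #     self._merged_results.append(result)   # under RLock()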


@@ -77,7 +77,7 @@ def send_http_request(engine, request_params):
     return req(request_params['url'], **request_args)

-def search_one_request(engine, query, request_params):
+def search_one_http_request(engine, query, request_params):
     # update request parameters dependent on
     # search-engine (contained in engines folder)
     engine.request(query, request_params)
@@ -97,7 +97,53 @@ def search_one_request(engine, query, request_params):
     return engine.response(response)

+def search_one_offline_request(engine, query, request_params):
+    return engine.search(query, request_params)
+
+
 def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
+    if engines[engine_name].offline:
+        return search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit)  # noqa
+    return search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit)
+
+
+def search_one_offline_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
+    engine = engines[engine_name]
+
+    try:
+        search_results = search_one_offline_request(engine, query, request_params)
+
+        if search_results:
+            result_container.extend(engine_name, search_results)
+
+            engine_time = time() - start_time
+            result_container.add_timing(engine_name, engine_time, engine_time)
+            with threading.RLock():
+                engine.stats['engine_time'] += engine_time
+                engine.stats['engine_time_count'] += 1
+
+    except ValueError as e:
+        record_offline_engine_stats_on_error(engine, result_container, start_time)
+        logger.exception('engine {0} : invalid input : {1}'.format(engine_name, e))
+    except Exception as e:
+        record_offline_engine_stats_on_error(engine, result_container, start_time)
+
+        result_container.add_unresponsive_engine((
+            engine_name,
+            u'{0}: {1}'.format(gettext('unexpected crash'), e),
+        ))
+        logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
+
+
+def record_offline_engine_stats_on_error(engine, result_container, start_time):
+    engine_time = time() - start_time
+    result_container.add_timing(engine.name, engine_time, engine_time)
+    with threading.RLock():
+        engine.stats['errors'] += 1
+
+
+def search_one_http_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
     # set timeout for all HTTP requests
     requests_lib.set_timeout_for_thread(timeout_limit, start_time=start_time)
     # reset the HTTP total time
@@ -111,7 +157,7 @@ def search_one_request_safe(engine_name, query, request_params, result_container
     try:
         # send requests and parse the results
-        search_results = search_one_request(engine, query, request_params)
+        search_results = search_one_http_request(engine, query, request_params)

         # check if the engine accepted the request
         if search_results is not None:
@@ -427,20 +473,22 @@ class Search(object):
                 continue

            # set default request parameters
-           request_params = default_request_params()
-           request_params['headers']['User-Agent'] = user_agent
+           request_params = {}
+           if not engine.offline:
+               request_params = default_request_params()
+               request_params['headers']['User-Agent'] = user_agent
+
+               if hasattr(engine, 'language') and engine.language:
+                   request_params['language'] = engine.language
+               else:
+                   request_params['language'] = search_query.lang
+
+               request_params['safesearch'] = search_query.safesearch
+               request_params['time_range'] = search_query.time_range
+
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

-           if hasattr(engine, 'language') and engine.language:
-               request_params['language'] = engine.language
-           else:
-               request_params['language'] = search_query.lang
-
-           # 0 = None, 1 = Moderate, 2 = Strict
-           request_params['safesearch'] = search_query.safesearch
-           request_params['time_range'] = search_query.time_range
-
            # append request to list
            requests.append((selected_engine['name'], search_query.query, request_params))


@@ -161,11 +161,12 @@ engines:
     weight : 2
     disabled : True

-  - name : digbt
-    engine : digbt
-    shortcut : dbt
-    timeout : 6.0
-    disabled : True
+# cloudflare protected
+# - name : digbt
+#   engine : digbt
+#   shortcut : dbt
+#   timeout : 6.0
+#   disabled : True

   - name : digg
     engine : digg
@@ -703,9 +704,9 @@ engines:
     shortcut: vo
     categories: social media
     search_url : https://searchvoat.co/?t={query}
-    url_xpath : //div[@class="entry"]/p/a[contains(@class, "title")]/@href
-    title_xpath : //div[@class="entry"]/p/a[contains(@class, "title")]
-    content_xpath : //div[@class="entry"]/p/span[@class="domain"]/a/text()
+    url_xpath : //div[@class="entry"]//p[@class="title"]/a/@href
+    title_xpath : //div[@class="entry"]//p[@class="title"]/a/text()
+    content_xpath : //div[@class="entry"]//span[@class="domain"]/a/text()
     timeout : 10.0
     disabled : True

File diff suppressed because one or more lines are too long


@@ -325,6 +325,10 @@ a {
     font-size: 0.9em;
 }

+.result .engines {
+    text-align: right;
+}
+
 .result .content {
     margin: 0;
     color: #666;

File diff suppressed because one or more lines are too long


@@ -376,6 +376,10 @@ table {
     width: 100%;
 }

+.result-table {
+    margin-bottom: 10px;
+}
+
 td {
     padding: 0 4px;
 }


@@ -0,0 +1,13 @@
<div class="result">
<table>
{% for key, value in result.items() %}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
<td><b>{{ key|upper }}</b>: {{ value|safe }}</td>
</tr>
{% endfor %}
</table>
<p class="engines">{{ result.engines|join(', ') }}</p>
</div>


@@ -0,0 +1,13 @@
<table class="result-table">
{% for key, value in result.items() %}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
<td><b>{{ key|upper }}</b>: {{ value|safe }}</td>
</tr>
{% endfor %}
<tr>
<td><b>ENGINES</b>: {{ result.engines|join(', ') }}</td>
</tr>
</table>


@@ -14,7 +14,7 @@
 <!-- Draw result header -->
 {% macro result_header(result, favicons) -%}
-<h4 class="result_header">{% if result.engine~".png" in favicons %}{{ draw_favicon(result.engine) }} {% endif %}{{ result_link(result.url, result.title|safe) }}</h4>
+<h4 class="result_header">{% if result.engine~".png" in favicons %}{{ draw_favicon(result.engine) }} {% endif %}{% if result.url %}{{ result_link(result.url, result.title|safe) }}{% else %}{{ result.title|safe}}{% endif %}</h4>
 {%- endmacro %}

 <!-- Draw result sub header -->
@@ -31,12 +31,16 @@
         {% for engine in result.engines %}
             <span class="label label-default">{{ engine }}</span>
         {% endfor %}
+        {% if result.url %}
         <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small>
+        {% endif %}
         {% if proxify %}
         <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small>
         {% endif %}
     </div>
+    {% if result.pretty_url %}
     <div class="external-link">{{ result.pretty_url }}</div>
+    {% endif %}
 {%- endmacro %}

 <!-- Draw result footer -->
@@ -45,11 +49,15 @@
         {% for engine in result.engines %}
             <span class="label label-default">{{ engine }}</span>
         {% endfor %}
+        {% if result.url %}
         <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info") }}</small>
+        {% endif %}
         {% if proxify %}
         <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info") }}</small>
         {% endif %}
+    {% if result.pretty_url %}
     <div class="external-link">{{ result.pretty_url }}</div>
+    {% endif %}
 {%- endmacro %}

 {% macro preferences_item_header(info, label, rtl) -%}


@@ -0,0 +1,19 @@
{% from 'oscar/macros.html' import result_footer, result_footer_rtl with context %}
<div class="panel panel-default">
<table class="table table-responsive table-bordered table-condensed">
{% for key, value in result.items() %}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
<td><b>{{ key|upper }}</b>: {{ value }}</td>
</tr>
{% endfor %}
</table>
{% if rtl %}
{{ result_footer_rtl(result) }}
{% else %}
{{ result_footer(result) }}
{% endif %}
</div>


@@ -0,0 +1,11 @@
<table>
{% for key, value in result.items() %}
{% if key in ['engine', 'engines', 'template', 'score', 'category', 'positions'] %}
{% continue %}
{% endif %}
<tr>
<td><b>{{ key|upper }}</b>: {{ value }}</td>
</tr>
{% endfor %}
</table>
<div class="engines">{% for engine in result.engines %}<span>{{ engine }}</span>{% endfor %}</div>{{- '' -}}


@@ -308,14 +308,15 @@ def int_or_zero(num):
 def is_valid_lang(lang):
     is_abbr = (len(lang) == 2)
+    lang = lang.lower().decode('utf-8')
     if is_abbr:
         for l in language_codes:
-            if l[0][:2] == lang.lower():
+            if l[0][:2] == lang:
                 return (True, l[0][:2], l[3].lower())
         return False
     else:
         for l in language_codes:
-            if l[1].lower() == lang.lower():
+            if l[1].lower() == lang or l[3].lower() == lang:
                 return (True, l[0][:2], l[3].lower())
         return False
@@ -434,3 +435,18 @@ def ecma_unescape(s):
     # "%20" becomes " ", "%F3" becomes "ó"
     s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
     return s
+
+
+def get_engine_from_settings(name):
+    """Return engine configuration from settings.yml of a given engine name"""
+
+    if 'engines' not in settings:
+        return {}
+
+    for engine in settings['engines']:
+        if 'name' not in engine:
+            continue
+        if name == engine['name']:
+            return engine
+
+    return {}
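
A short usage sketch of the new helper, assuming a stock settings.yml such as the one changed above (which contains a digg engine entry):

    from searx.utils import get_engine_from_settings

    # returns the engine's dict from settings.yml, or {} when the name is unknown
    digg_settings = get_engine_from_settings('digg')
    print(digg_settings.get('engine'))                  # -> 'digg'
    print(get_engine_from_settings('no-such-engine'))   # -> {}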


@@ -124,6 +124,7 @@ app = Flask(

 app.jinja_env.trim_blocks = True
 app.jinja_env.lstrip_blocks = True
+app.jinja_env.add_extension('jinja2.ext.loopcontrols')
 app.secret_key = settings['server']['secret_key']

 if not searx_debug \
@@ -538,14 +539,16 @@ def index():
         if output_format == 'html':
             if 'content' in result and result['content']:
                 result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
-            result['title'] = highlight_content(escape(result['title'] or u''), search_query.query)
+            if 'title' in result and result['title']:
+                result['title'] = highlight_content(escape(result['title'] or u''), search_query.query)
         else:
             if result.get('content'):
                 result['content'] = html_to_text(result['content']).strip()
             # removing html content and whitespace duplications
             result['title'] = ' '.join(html_to_text(result['title']).strip().split())

-        result['pretty_url'] = prettify_url(result['url'])
+        if 'url' in result:
+            result['pretty_url'] = prettify_url(result['url'])

         # TODO, check if timezone is calculated right
         if 'publishedDate' in result:


@@ -22,74 +22,3 @@ class TestDeviantartEngine(SearxTestCase):
        dicto['time_range'] = 'year'
        params = deviantart.request(query, dicto)
        self.assertEqual({}, params['url'])
def test_response(self):
self.assertRaises(AttributeError, deviantart.response, None)
self.assertRaises(AttributeError, deviantart.response, [])
self.assertRaises(AttributeError, deviantart.response, '')
self.assertRaises(AttributeError, deviantart.response, '[]')
response = mock.Mock(text='<html></html>')
self.assertEqual(deviantart.response(response), [])
response = mock.Mock(status_code=302)
self.assertEqual(deviantart.response(response), [])
html = """
<div id="page-1-results" class="page-results results-page-thumb torpedo-container">
<span class="thumb wide" href="http://amai911.deviantart.com/art/Horse-195212845"
data-super-full-width="900" data-super-full-height="600">
<a class="torpedo-thumb-link" href="https://url.of.image">
<img data-sigil="torpedo-img" src="https://url.of.thumbnail" />
</a>
<span class="info"><span class="title-wrap"><span class="title">Title of image</span></span>
</div>
"""
response = mock.Mock(text=html)
results = deviantart.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['title'], 'Title of image')
self.assertEqual(results[0]['url'], 'https://url.of.image')
self.assertNotIn('content', results[0])
self.assertEqual(results[0]['thumbnail_src'], 'https://url.of.thumbnail')
html = """
<span class="tt-fh-tc" style="width: 202px;">
<span class="tt-bb" style="width: 202px;">
</span>
<span class="shadow">
<a class="thumb" href="http://url.of.result/2nd.part.of.url"
title="Behoimi BE Animation Test by test-0, Jan 4,
2010 in Digital Art &gt; Animation"> <i></i>
<img width="200" height="200" alt="Test"
src="http://url.of.thumbnail" data-src="http://th08.deviantart.net/test.jpg">
</a>
</span>
<!-- ^TTT -->
</span>
<span class="details">
<a href="http://test-0.deviantart.com/art/Test" class="t"
title="Behoimi BE Animation Test by test-0, Jan 4, 2010">
<span class="tt-fh-oe">Title of image</span> </a>
<small>
<span class="category">
<span class="age">
5 years ago
</span>
in <a title="Behoimi BE Animation Test by test-0, Jan 4, 2010"
href="http://www.deviantart.com/browse/all/digitalart/animation/">Animation</a>
</span>
<div class="commentcount">
<a href="http://test-0.deviantart.com/art/Test#comments">
<span class="iconcommentsstats"></span>9 Comments</a>
</div>
<a class="mlt-link" href="http://www.deviantart.com/morelikethis/149167425">
<span class="mlt-icon"></span> <span class="mlt-text">More Like This</span> </a>
</span>
</small> <!-- TTT$ -->
"""
response = mock.Mock(text=html)
results = deviantart.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)


@@ -14,88 +14,3 @@ class TestDiggEngine(SearxTestCase):
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('digg.com', params['url'])
def test_response(self):
self.assertRaises(AttributeError, digg.response, None)
self.assertRaises(AttributeError, digg.response, [])
self.assertRaises(AttributeError, digg.response, '')
self.assertRaises(AttributeError, digg.response, '[]')
response = mock.Mock(text='{}')
self.assertEqual(digg.response(response), [])
response = mock.Mock(text='{"data": []}')
self.assertEqual(digg.response(response), [])
json = """
{
"status": "ok",
"num": 10,
"next_position": 20,
"html": "<article itemscope itemtype=\\"http://schema.org/Article\\"
class=\\"story-container digg-story-el hentry entry story-1sRANah col-1\\"
data-content-id=\\"1sRANah\\" data-contenturl=\\"http://url.of.link\\"
data-position=\\"0\\" data-diggs=\\"24\\" data-tweets=\\"69\\"
data-digg-score=\\"1190\\"> <div class=\\"story-image story-image-thumb\\">
<a data-position=\\"0\\" data-content-id=\\"1sRANah\\"
class=\\"story-link\\" href=\\"http://www.thedailybeast.com/\\"
target=\\"_blank\\"><img class=\\"story-image-img\\"
src=\\"http://url.of.image.jpeg\\" width=\\"312\\" height=\\"170\\"
alt=\\"\\" /> </a> </div> <div class=\\"story-content\\"><header
class=\\"story-header\\"> <div itemprop=\\"alternativeHeadline\\"
class=\\"story-kicker\\" >Kicker</div> <h2 itemprop=\\"headline\\"
class=\\"story-title entry-title\\"><a class=\\"story-title-link story-link\\"
rel=\\"bookmark\\" itemprop=\\"url\\" href=\\"http://www.thedailybeast.com/\\"
target=\\"_blank\\">Title of article</h2> <div class=\\"story-meta\\">
<div class=\\"story-score \\">
<div class=\\"story-score-diggscore diggscore-1sRANah\\">1190</div>
<div class=\\"story-score-details\\"> <div class=\\"arrow\\"></div>
<ul class=\\"story-score-details-list\\"> <li
class=\\"story-score-detail story-score-diggs\\"><span
class=\\"label\\">Diggs:</span> <span class=\\"count diggs-1sRANah\\">24</span>
</li> <li class=\\"story-score-detail story-score-twitter\\"><span
class=\\"label\\">Tweets:</span> <span class=\\"count tweets-1sRANah\\">69</span>
</li> <li class=\\"story-score-detail story-score-facebook\\"><span
class=\\"label\\">Facebook Shares:</span> <span
class=\\"count fb_shares-1sRANah\\">1097</span></li> </ul> </div> </div>
<span class=\\"story-meta-item story-source\\"> <a
itemprop=\\"publisher copyrightHolder sourceOrganization provider\\"
class=\\"story-meta-item-link story-source-link\\"
href=\\"/source/thedailybeast.com\\">The Daily Beast </a> </span>
<span class=\\"story-meta-item story-tag first-tag\\"> <a
itemprop=\\"keywords\\" rel=\\"tag\\"
class=\\"story-meta-item-link story-tag-link\\" href=\\"/tag/news\\">News</a>
</span> <abbr class=\\"published story-meta-item story-timestamp\\"
title=\\"2014-10-18 14:53:45\\"> <time datetime=\\"2014-10-18 14:53:45\\">18 Oct 2014</time>
</abbr> </div> </header> </div> <ul class=\\"story-actions\\"> <li
class=\\"story-action story-action-digg btn-story-action-container\\">
<a class=\\"target digg-1sRANah\\" href=\\"#\\">Digg</a></li> <li
class=\\"story-action story-action-save btn-story-action-container\\">
<a class=\\"target save-1sRANah\\" href=\\"#\\">Save</a></li> <li
class=\\"story-action story-action-share\\"><a
class=\\"target share-facebook\\" href=\\"https://www.facebook.com/\\">Facebook</a></li>
<li class=\\"story-action story-action-share\\"><a class=\\"target share-twitter\\"
href=\\"https://twitter.com/\\">Twitter</a></li> </ul> </article>"
}
"""
json = json.replace('\r\n', '').replace('\n', '').replace('\r', '')
response = mock.Mock(text=json)
results = digg.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['title'], 'Title of article')
self.assertEqual(results[0]['url'], 'http://url.of.link')
self.assertEqual(results[0]['thumbnail'], 'http://url.of.image.jpeg')
self.assertEqual(results[0]['content'], '')
json = """
{
"status": "error",
"num": 10,
"next_position": 20
}
"""
response = mock.Mock(text=json)
results = digg.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 0)


@@ -18,12 +18,9 @@ class TestStartpageEngine(SearxTestCase):
         self.assertIn('data', params)
         self.assertIn('query', params['data'])
         self.assertIn(query, params['data']['query'])
-        self.assertIn('with_language', params['data'])
-        self.assertIn('lang_fr', params['data']['with_language'])

         dicto['language'] = 'all'
         params = startpage.request(query, dicto)
-        self.assertNotIn('with_language', params['data'])

     def test_response(self):
         self.assertRaises(AttributeError, startpage.response, None)
@@ -35,33 +32,32 @@ class TestStartpageEngine(SearxTestCase):
         self.assertEqual(startpage.response(response), [])

         html = """
-        <li class="search-result search-item">
-            <h3>
-                <a href='http://this.should.be.the.link/' id='title_2' name='title_2' >
-                    This should be the title
-                </a>
-                <span id='title_stars_2' name='title_stars_2'> </span>
-            </h3>
-            <p class="search-item__body">
-                This should be the content.
-            </p>
-            <p>
-                <span class='url'>www.speed<b>test</b>.net/fr/
-                </span>
-                -
-                <A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata="
-                class='proxy'>
-                Navigation avec Ixquick Proxy
-                </A>
-                -
-                <A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid=
-                &hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0=
-                &mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'>
-                Mis en surbrillance
-                </A>
-            </p>
-        </li>
-        """
+<div class="w-gl__result">
+    <a
+        class="w-gl__result-title"
+        href="http://this.should.be.the.link/"
+        data-onw="1"
+        rel="noopener noreferrer"
+        target="_blank">
+        <h3>This should be the title</h3>
+    </a>
+    <div class="w-gl__result-second-line-container">
+        <div class="w-gl__result-url-container">
+            <a
+                class="w-gl__result-url"
+                href="http://this.should.be.the.link/"
+                rel="noopener noreferrer"
+                target="_blank">https://www.cnbc.com/2019/10/12/dj-zedd-banned-in-china-for-liking-a-south-park-tweet.html</a>
+        </div>
+        <a
+            class="w-gl__anonymous-view-url"
+            href="https://eu-browse.startpage.com/do/proxy?ep=556b554d576b6f5054554546423167764b5445616455554d5342675441774659495246304848774f5267385453304941486b5949546c63704e33774f526b705544565647516d4a61554246304847674f4a556f6957415a4f436b455042426b6b4f7a64535a52784a56514a4f45307743446c567250445a4f4c52514e5677554e46776b4b545563704c7931554c5167465467644f42464d4f4255426f4d693152624634525741305845526c595746636b626d67494e42705743466c515252634f4267456e597a7346596b7856435134465345634f564249794b5752785643315863546769515773764a5163494c5877505246315865456f5141426b4f41774167596d6c5a4e30395758773442465251495677596c624770665a6b786344466b4151455663425249794d6a78525a55554157516f4342556766526b51314b57514e&amp;ek=4q58686o5047786n6343527259445247576p6o38&amp;ekdata=84abd523dc13cba5c65164d04d7d7263"
+            target="_blank">Anonymous View</a>
+    </div>
+    <p class="w-gl__description">This should be the content.</p>
+</div>
+""" # noqa
         response = mock.Mock(text=html.encode('utf-8'))
         results = startpage.response(response)
         self.assertEqual(type(results), list)
@@ -69,72 +65,3 @@ class TestStartpageEngine(SearxTestCase):
         self.assertEqual(results[0]['title'], 'This should be the title')
         self.assertEqual(results[0]['url'], 'http://this.should.be.the.link/')
         self.assertEqual(results[0]['content'], 'This should be the content.')
html = """
<li class="search-result search-item">
<h3>
<a href='http://www.google.com/aclk?sa=l&ai=C' id='title_2' name='title_2' >
This should be the title
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p class="search-item__body">
This should be the content.
</p>
<p>
<span class='url'>www.speed<b>test</b>.net/fr/
</span>
-
<A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata="
class='proxy'>
Navigation avec Ixquick Proxy
</A>
-
<A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid=
&hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0=
&mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'>
Mis en surbrillance
</A>
</p>
</li>
<li class="search-result search-item">
<h3>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p class="search-item__body">
This should be the content.
</p>
<p>
<span class='url'>www.speed<b>test</b>.net/fr/
</span>
</p>
</li>
<li class="search-result search-item">
<h3>
<a href='http://this.should.be.the.link/' id='title_2' name='title_2' >
This should be the title
</a>
<span id='title_stars_2' name='title_stars_2'> </span>
</h3>
<p>
<span class='url'>www.speed<b>test</b>.net/fr/
</span>
-
<A class="proxy" id="proxy_link" HREF="https://ixquick-proxy.com/do/spg/proxy?ep=&edata=&ek=&ekdata="
class='proxy'>
Navigation avec Ixquick Proxy
</A>
-
<A HREF="https://ixquick-proxy.com/do/spg/highlight.pl?l=francais&c=hf&cat=web&q=test&rl=NONE&rid=
&hlq=https://startpage.com/do/search&mtabp=-1&mtcmd=process_search&mtlanguage=francais&mtengine0=
&mtcat=web&u=http:%2F%2Fwww.speedtest.net%2Ffr%2F" class='proxy'>
Mis en surbrillance
</A>
</p>
</li>
"""
response = mock.Mock(text=html.encode('utf-8'))
results = startpage.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['content'], '')


@@ -12,46 +12,3 @@
        self.assertTrue('url' in params)
        self.assertTrue(query in params['url'])
        self.assertTrue('1x.com' in params['url'])
def test_response(self):
self.assertRaises(AttributeError, www1x.response, None)
self.assertRaises(AttributeError, www1x.response, [])
self.assertRaises(AttributeError, www1x.response, '')
self.assertRaises(AttributeError, www1x.response, '[]')
response = mock.Mock(text='<html></html>')
self.assertEqual(www1x.response(response), [])
html = """
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
[
<!ELEMENT characters (character*) >
<!ELEMENT character (#PCDATA ) >
<!ENTITY iexcl "&#161;" >
<!ENTITY cent "&#162;" >
<!ENTITY pound "&#163;" >
]
><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td style="min-width: 220px;" valign="top">
<div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
<div>
<a href="/photo/123456" class="dynamiclink">
<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
</a>
<a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
</a>
</div>
</td>
</table>
]]></searchresult></root>
"""
response = mock.Mock(text=html)
results = www1x.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[0]['url'], 'https://1x.com/photo/123456')
self.assertEqual(results[0]['thumbnail_src'], 'https://1x.com/images/user/testimage-123456.jpg')
self.assertEqual(results[0]['content'], '')
self.assertEqual(results[0]['template'], 'images.html')