This commit is contained in:
pw3t 2014-01-05 17:57:55 +01:00
commit 0d93ad2018
14 changed files with 98 additions and 55 deletions

View file

@ -5,7 +5,7 @@ number_of_results = 1
[bing] [bing]
engine = bing engine = bing
language = en-us locale = en-US
[cc] [cc]
engine=currency_convert engine=currency_convert
@ -20,6 +20,7 @@ engine = duckduckgo_definitions
[duckduckgo] [duckduckgo]
engine = duckduckgo engine = duckduckgo
locale = en-us
[flickr] [flickr]
engine = flickr engine = flickr
@ -63,17 +64,17 @@ categories = social media
[urbandictionary] [urbandictionary]
engine = xpath engine = xpath
search_url = http://www.urbandictionary.com/define.php?term={query} search_url = http://www.urbandictionary.com/define.php?term={query}
url_xpath = //div[@id="entries"]//div[@class="word"]//a url_xpath = //div[@id="entries"]//div[@class="word"]/a/@href
title_xpath = //div[@id="entries"]//div[@class="word"]//span//text() title_xpath = //div[@id="entries"]//div[@class="word"]/span
content_xpath = //div[@id="entries"]//div[@class="text"]//div[@class="definition"]//text() content_xpath = //div[@id="entries"]//div[@class="text"]/div[@class="definition"]
[yahoo] [yahoo]
engine = xpath engine = xpath
search_url = http://search.yahoo.com/search?p={query} search_url = http://search.yahoo.com/search?p={query}
results_xpath = //div[@class="res"] results_xpath = //div[@class="res"]
url_xpath = .//span[@class="url"]//text() url_xpath = .//h3/a/@href
content_xpath = .//div[@class="abstr"]//text() title_xpath = .//h3/a
title_xpath = .//h3/a//text() content_xpath = .//div[@class="abstr"]
suggestion_xpath = //div[@id="satat"]//a suggestion_xpath = //div[@id="satat"]//a
[youtube] [youtube]
@ -82,5 +83,6 @@ categories = videos
[dailymotion] [dailymotion]
engine = dailymotion engine = dailymotion
locale = en_US
categories = videos categories = videos

View file

@ -261,7 +261,7 @@ def get_engines_stats():
for engine in errors: for engine in errors:
if max_errors: if max_errors:
engine['percentage'] = int(engine['avg']/max_errors*100) engine['percentage'] = int(float(engine['avg'])/max_errors*100)
else: else:
engine['percentage'] = 0 engine['percentage'] = 0

View file

@ -4,11 +4,11 @@ from cgi import escape
base_url = 'http://www.bing.com/' base_url = 'http://www.bing.com/'
search_string = 'search?{query}' search_string = 'search?{query}'
language = 'en-us' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx locale = 'en-US' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx
def request(query, params): def request(query, params):
search_path = search_string.format(query=urlencode({'q': query, 'setmkt': language})) search_path = search_string.format(query=urlencode({'q': query, 'setmkt': locale}))
#if params['category'] == 'images': #if params['category'] == 'images':
# params['url'] = base_url + 'images/' + search_path # params['url'] = base_url + 'images/' + search_path
params['url'] = base_url + search_path params['url'] = base_url + search_path

View file

@ -1,16 +1,17 @@
from urllib import urlencode from urllib import urlencode
from lxml import html
from json import loads from json import loads
from cgi import escape from cgi import escape
categories = ['videos'] categories = ['videos']
localization = 'en' locale = 'en_US'
# see http://www.dailymotion.com/doc/api/obj-video.html # see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}' search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}'
def request(query, params): def request(query, params):
global search_url global search_url
params['url'] = search_url.format(query=urlencode({'search': query, 'localization': localization })) params['url'] = search_url.format(query=urlencode({'search': query, 'localization': locale }))
return params return params
@ -27,6 +28,11 @@ def response(resp):
else: else:
content = '' content = ''
if res['description']: if res['description']:
content += escape(res['description'][:500]) description = text_content_from_html(res['description'])
content += description[:500]
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results
def text_content_from_html(html_string):
desc_html = html.fragment_fromstring(html_string, create_parent=True)
return desc_html.text_content()

View file

@ -3,10 +3,11 @@ from urllib import urlencode
from searx.utils import html_to_text from searx.utils import html_to_text
url = 'https://duckduckgo.com/' url = 'https://duckduckgo.com/'
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0' search_url = url + 'd.js?{query}&p=1&s=0'
locale = 'us-en'
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query})) params['url'] = search_url.format(query=urlencode({'q': query, 'l': locale}))
return params return params

View file

@ -1,7 +1,7 @@
import json import json
from urllib import urlencode from urllib import urlencode
url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0' url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1'
def request(query, params): def request(query, params):
params['url'] = url.format(query=urlencode({'q': query})) params['url'] = url.format(query=urlencode({'q': query}))

View file

@ -1,6 +1,4 @@
from json import loads
from urllib import urlencode from urllib import urlencode
from searx.utils import html_to_text
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
url = 'http://www.filecrop.com/' url = 'http://www.filecrop.com/'
@ -10,7 +8,7 @@ class FilecropResultParser(HTMLParser):
def __init__(self): def __init__(self):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.__start_processing = False self.__start_processing = False
self.results = [] self.results = []
self.result = {} self.result = {}
@ -22,7 +20,7 @@ class FilecropResultParser(HTMLParser):
if tag == 'tr': if tag == 'tr':
if ('bgcolor', '#edeff5') in attrs or ('bgcolor', '#ffffff') in attrs: if ('bgcolor', '#edeff5') in attrs or ('bgcolor', '#ffffff') in attrs:
self.__start_processing = True self.__start_processing = True
if not self.__start_processing: if not self.__start_processing:
return return
@ -50,7 +48,7 @@ class FilecropResultParser(HTMLParser):
self.data_counter = 0 self.data_counter = 0
self.results.append(self.result) self.results.append(self.result)
self.result = {} self.result = {}
def handle_data(self, data): def handle_data(self, data):
if not self.__start_processing: if not self.__start_processing:
return return
@ -59,7 +57,7 @@ class FilecropResultParser(HTMLParser):
self.result['content'] += data + ' ' self.result['content'] += data + ' '
else: else:
self.result['content'] = data + ' ' self.result['content'] = data + ' '
self.data_counter += 1 self.data_counter += 1
def request(query, params): def request(query, params):

0
searx/engines/flickr.py Executable file → Normal file
View file

0
searx/engines/google_images.py Executable file → Normal file
View file

View file

@ -19,14 +19,13 @@ def response(resp):
global base_url global base_url
results = [] results = []
dom = html.fromstring(resp.content) dom = html.fromstring(resp.content)
for result in dom.xpath('//div[@class="result"]'): # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# non-ad results: each div[@class="result"] is a direct child of div[@id="results"]
for result in dom.xpath('//div[@id="results"]/div[@class="result"]'):
link = result.xpath('.//h3/a')[0] link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
parsed_url = urlparse(url) parsed_url = urlparse(url)
# TODO better google link detection title = link.text_content()
if parsed_url.netloc.find('www.google.com') >= 0: content = result.xpath('./p[@class="desc"]')[0].text_content()
continue
title = ' '.join(link.xpath('.//text()'))
content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results

View file

@ -1,5 +1,5 @@
from lxml import html from lxml import html
from urllib import urlencode from urllib import urlencode, unquote
from urlparse import urlparse, urljoin from urlparse import urlparse, urljoin
from cgi import escape from cgi import escape
from lxml.etree import _ElementStringResult from lxml.etree import _ElementStringResult
@ -11,32 +11,64 @@ title_xpath = None
suggestion_xpath = '' suggestion_xpath = ''
results_xpath = '' results_xpath = ''
def extract_url(xpath_results): '''
url = '' if xpath_results is a list, extract the text from each result and concatenate the results
parsed_search_url = urlparse(search_url) if xpath_results is an XML element, extract all the text nodes from it ( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
if type(xpath_results) == list: if type(xpath_results) == list:
# it's a list of results: concatenate everything using recursive calls
if not len(xpath_results): if not len(xpath_results):
raise Exception('Empty url resultset') raise Exception('Empty url resultset')
if type(xpath_results[0]) == _ElementStringResult: result = ''
url = ''.join(xpath_results) for e in xpath_results:
if url.startswith('//'): result = result + extract_text(e)
url = parsed_search_url.scheme+url return result
elif url.startswith('/'): elif type(xpath_results) == _ElementStringResult:
url = urljoin(search_url, url) # it's a string
#TODO return ''.join(xpath_results)
else:
url = xpath_results[0].attrib.get('href')
else: else:
url = xpath_results.attrib.get('href') # it's an element
if not url.startswith('http://') and not url.startswith('https://'): return xpath_results.text_content()
url = 'http://'+url
def extract_url(xpath_results):
url = extract_text(xpath_results)
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
url = parsed_search_url.scheme+url
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
# normalize url
url = normalize_url(url)
return url
def normalize_url(url):
parsed_url = urlparse(url) parsed_url = urlparse(url)
# add a / at the end of the url if there is no path
if not parsed_url.netloc: if not parsed_url.netloc:
raise Exception('Cannot parse url') raise Exception('Cannot parse url')
if not parsed_url.path: if not parsed_url.path:
url += '/' url += '/'
# FIXME : hack for yahoo
if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
return unquote(p[mark+3:]).decode('utf-8')
return url return url
def request(query, params): def request(query, params):
query = urlencode({'q': query})[2:] query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query) params['url'] = search_url.format(query=query)
@ -50,15 +82,19 @@ def response(resp):
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath)) url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath)) title = extract_text(result.xpath(title_xpath)[0 ])
content = escape(' '.join(result.xpath(content_xpath))) content = extract_text(result.xpath(content_xpath)[0])
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
else: else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): for url, title, content in zip(
map(extract_url, dom.xpath(url_xpath)), \
map(extract_text, dom.xpath(title_xpath)), \
map(extract_text, dom.xpath(content_xpath)), \
):
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath: if not suggestion_xpath:
return results return results
for suggestion in dom.xpath(suggestion_xpath): for suggestion in dom.xpath(suggestion_xpath):
results.append({'suggestion': escape(''.join(suggestion.xpath('.//text()')))}) results.append({'suggestion': extract_text(suggestion)})
return results return results

View file

@ -1,5 +1,5 @@
from json import loads from json import loads
from urllib import urlencode, quote from urllib import urlencode
url = 'http://localhost:8090' url = 'http://localhost:8090'
search_url = '/yacysearch.json?{query}&maximumRecords=10' search_url = '/yacysearch.json?{query}&maximumRecords=10'
@ -10,7 +10,7 @@ def request(query, params):
def response(resp): def response(resp):
raw_search_results = loads(resp.text) raw_search_results = loads(resp.text)
if not len(raw_search_results): if not len(raw_search_results):
return [] return []
@ -22,10 +22,10 @@ def response(resp):
tmp_result = {} tmp_result = {}
tmp_result['title'] = result['title'] tmp_result['title'] = result['title']
tmp_result['url'] = result['link'] tmp_result['url'] = result['link']
tmp_result['content'] = '' tmp_result['content'] = ''
if len(result['description']): if len(result['description']):
tmp_result['content'] += result['description'] +"<br/>" tmp_result['content'] += result['description'] +"<br/>"
if len(result['pubDate']): if len(result['pubDate']):
tmp_result['content'] += result['pubDate'] + "<br/>" tmp_result['content'] += result['pubDate'] + "<br/>"

View file

@ -37,7 +37,7 @@
<p>It's ok if you don't trust us regarding the logs, <a href="https://github.com/asciimoo/searx">take the code</a> and run it yourself! decentralize!</p> <p>It's ok if you don't trust us regarding the logs, <a href="https://github.com/asciimoo/searx">take the code</a> and run it yourself! decentralize!</p>
<h3>How to add to firefox?</h3> <h3>How to add to firefox?</h3>
<p><a href="#" onclick="window.external.AddSearchProvider(window.location.protocol + '//' + window.location.host + '/opensearch.xml')">Install</a> searx as a search engine on any version of Firefox! (javascript required)</p> <p><a href="#" onclick="window.external.AddSearchProvider(window.location.protocol + '//' + window.location.host + '/opensearch.xml')">Install</a> searx as a search engine on any version of Firefox! (javascript required)</p>
<h2 id="faq">Developer FAQ</h2> <h2 id="dev_faq">Developer FAQ</h2>
<h3>New engines?</h3> <h3>New engines?</h3>
<p><ul> <p><ul>
<li>Edit your engines.cfg, see <a href="https://raw.github.com/asciimoo/searx/master/engines.cfg_sample">sample config</a></li> <li>Edit your engines.cfg, see <a href="https://raw.github.com/asciimoo/searx/master/engines.cfg_sample">sample config</a></li>

View file

@ -152,7 +152,8 @@ def preferences():
selected_categories.append(category) selected_categories.append(category)
if selected_categories: if selected_categories:
resp = make_response(redirect('/')) resp = make_response(redirect('/'))
resp.set_cookie('categories', ','.join(selected_categories)) # cookie max age: 4 weeks
resp.set_cookie('categories', ','.join(selected_categories), max_age=60*60*24*7*4)
return resp return resp
return render('preferences.html') return render('preferences.html')