[mod] do not escape html content in engines

Adam Tauber 2016-12-09 11:44:24 +01:00
parent 28f12ef5a0
commit 16bdc0baf4
30 changed files with 56 additions and 97 deletions
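
The change is mechanical and repeats across all 30 files: drop the "from cgi import escape" import and return raw strings for titles and content, presumably so escaping can be applied once in the result handling instead of inside every engine. A minimal sketch of the resulting engine pattern (the engine, its XPath, and all names below are hypothetical, for illustration only):

from lxml import html
from searx.engines.xpath import extract_text

results_xpath = '//div[@class="result"]'  # hypothetical result selector


def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath(results_xpath):
        link = result.xpath('.//a')[0]
        results.append({'url': link.attrib.get('href'),
                        # before this commit: escape(extract_text(link))
                        'title': extract_text(link)})
    return results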

View file

@@ -12,7 +12,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
     for result in dom.xpath(xpath_results):
         link = result.xpath(xpath_link)[0]
         href = urljoin(base_url, link.attrib.get('href'))
-        title = escape(extract_text(link))
+        title = extract_text(link)

         results.append({'url': href,
                         'title': title})

View file

@@ -16,7 +16,6 @@
 from lxml import etree
 from urllib import urlencode
 from searx.utils import searx_useragent
-from cgi import escape
 from datetime import datetime
 import re
@@ -94,7 +93,7 @@ def response(resp):
                 url = item.text

             elif item.attrib["name"] == "dcdescription":
-                content = escape(item.text[:300])
+                content = item.text[:300]
                 if len(item.text) > 300:
                     content += "..."

View file

@@ -14,7 +14,6 @@
 """

 from urllib import urlencode
-from cgi import escape
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -61,7 +60,7 @@ def response(resp):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))

         # append result
         results.append({'url': url,
@@ -73,7 +72,7 @@ def response(resp):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))

         # append result
         results.append({'url': url,

View file

@@ -11,7 +11,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -51,8 +50,8 @@ def response(resp):
     for result in search_res:
         link = result.xpath('.//td[@class="torrent_name"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+        title = extract_text(link)
+        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
         content = "<br />".join(content.split("\n"))

         filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]

View file

@@ -14,7 +14,6 @@
 from urllib import urlencode
 from json import loads
-from cgi import escape
 from datetime import datetime

 # engine dependent config
@@ -57,7 +56,7 @@ def response(resp):
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        content = escape(res['description'])
+        content = res['description']
         thumbnail = res['thumbnail_360_url']
         publishedDate = datetime.fromtimestamp(res['created_time'], None)
         embedded = embedded_url.format(videoid=res['id'])

View file

@@ -51,10 +51,11 @@ def response(resp):
         if url.startswith('http://'):
             url = 'https' + url[4:]

-        content = result['artist']['name'] +\
-            " &bull; " +\
-            result['album']['title'] +\
-            " &bull; " + result['title']
+        content = '{} - {} - {}'.format(
+            result['artist']['name'],
+            result['album']['title'],
+            result['title'])

         embedded = embedded_url.format(audioid=result['id'])

         # append result

View file

@@ -12,7 +12,6 @@
 import re
 from urlparse import urljoin
 from lxml import html
-from cgi import escape
 from searx.utils import is_valid_lang

 categories = ['general']
@@ -62,8 +61,8 @@ def response(resp):
         results.append({
             'url': urljoin(resp.url, '?%d' % k),
-            'title': escape(from_result.text_content()),
-            'content': escape('; '.join(to_results))
+            'title': from_result.text_content(),
+            'content': '; '.join(to_results)
         })

     return results

View file

@@ -13,7 +13,6 @@
 from urllib import quote_plus
 from json import loads
 from lxml import html
-from cgi import escape
 from dateutil import parser

 # engine dependent config
@@ -56,7 +55,7 @@ def response(resp):
         url = result.attrib.get('data-contenturl')
         thumbnail = result.xpath('.//img')[0].attrib.get('src')
         title = ''.join(result.xpath(title_xpath))
-        content = escape(''.join(result.xpath(content_xpath)))
+        content = ''.join(result.xpath(content_xpath))
         pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
         publishedDate = parser.parse(pubdate)

View file

@@ -9,7 +9,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from urllib import urlencode
 from searx.engines.xpath import extract_text
 from lxml import html
@@ -43,7 +42,7 @@ def response(resp):
         img_src = app.xpath('.//img/@src')[0]
         content = extract_text(app.xpath('./p')[0])
-        content = escape(content.replace(title, '', 1).strip())
+        content = content.replace(title, '', 1).strip()

         results.append({'url': url,
                         'title': title,

View file

@@ -77,21 +77,13 @@ def response(resp):
         url = build_flickr_url(photo['owner'], photo['id'])

-        title = photo['title']
-
-        content = '<span class="photo-author">' +\
-                  photo['ownername'] +\
-                  '</span><br />' +\
-                  '<span class="description">' +\
-                  photo['description']['_content'] +\
-                  '</span>'
-
         # append result
         results.append({'url': url,
-                        'title': title,
+                        'title': photo['title'],
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': photo['description']['_content'],
+                        'author': photo['ownername'],
                         'template': 'images.html'})

     # return results

View file

@@ -102,16 +102,15 @@ def response(resp):
         title = photo.get('title', '')

-        content = '<span class="photo-author">' +\
-                  photo['username'] +\
-                  '</span><br />'
+        author = photo['username']

         # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': '',
+                        'author': author,
                         'template': 'images.html'})

     return results

View file

@@ -10,7 +10,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from json import loads
 from random import randint
 from time import time
@@ -78,8 +77,8 @@ def response(resp):
     for result in response_json['results']:
         # append result
         results.append({'url': result['url'],
-                        'title': escape(result['title']),
-                        'content': escape(result['sum'])})
+                        'title': result['title'],
+                        'content': result['sum']})

     # return results
     return results

View file

@@ -12,7 +12,6 @@
 from urllib import urlencode
 from json import loads
-from cgi import escape

 # engine dependent config
 categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
         url = res['html_url']

         if res['description']:
-            content = escape(res['description'][:500])
+            content = res['description'][:500]
         else:
             content = ''

View file

@@ -9,7 +9,6 @@
 # @parse url, title, content, suggestion

 import re
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
@@ -155,7 +154,7 @@ def parse_url(url_string, google_hostname):
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return escape(extract_text(r[0]))
+        return extract_text(r[0])
     return None
@@ -264,7 +263,7 @@ def response(resp):
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': escape(extract_text(suggestion))})
+        results.append({'suggestion': extract_text(suggestion)})

     # return results
     return results

View file

@@ -11,7 +11,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -57,7 +56,7 @@ def response(resp):
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
         leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
         filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))

View file

@@ -9,7 +9,6 @@
 @parse url, title, content, seed, leech, torrentfile
 """

-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):
         # torrent title
         page_a = result.xpath(xpath_title)[0]
-        title = escape(extract_text(page_a))
+        title = extract_text(page_a)

         # link to the page
         href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
         try:
             file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
             file_size = int(float(file_size) * get_filesize_mul(suffix))
-        except Exception as e:
+        except:
             file_size = None
@@ -105,7 +104,6 @@ def response(resp):
         # content string contains all information not included into template
         content = 'Category: "{category}". Downloaded {downloads} times.'
         content = content.format(category=category, downloads=downloads)
-        content = escape(content)

         results.append({'url': href,
                         'title': title,

View file

@@ -9,7 +9,6 @@
 # @parse url, title, content, seed, leech, magnetlink

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

         # convert seed to int if possible

View file

@@ -11,7 +11,6 @@
 """

 import json
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, urljoin
 from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
             img_results.append(params)
         else:
             created = datetime.fromtimestamp(data['created_utc'])
-            content = escape(data['selftext'])
+            content = data['selftext']
             if len(content) > 500:
                 content = content[:500] + '...'
             params['content'] = content

View file

@@ -44,20 +44,12 @@ def response(resp):
     # parse results
     for result in search_results.get('results', []):
         href = result['url']
-        title = "[" + result['type'] + "] " +\
-                result['namespace'] +\
-                " " + result['name']
-        content = '<span class="highlight">[' +\
-                  result['type'] + "] " +\
-                  result['name'] + " " +\
-                  result['synopsis'] +\
-                  "</span><br />" +\
-                  result['description']
+        title = "[{}] {} {}".format(result['type'], result['namespace'], result['name'])

         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': content})
+                        'content': result['description']})

     # return results
     return results

View file

@@ -9,7 +9,6 @@
 # @parse url, title, content, seed, leech, magnetlink

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter

View file

@@ -46,10 +46,11 @@ def response(resp):
         if result['type'] == 'track':
             title = result['name']
             url = result['external_urls']['spotify']
-            content = result['artists'][0]['name'] +\
-                " &bull; " +\
-                result['album']['name'] +\
-                " &bull; " + result['name']
+            content = '{} - {} - {}'.format(
+                result['artists'][0]['name'],
+                result['album']['name'],
+                result['name'])

             embedded = embedded_url.format(audioid=result['id'])

             # append result

View file

@@ -11,7 +11,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -48,8 +47,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath(content_xpath)))
+        title = extract_text(link)
+        content = extract_text(result.xpath(content_xpath))

         # append result
         results.append({'url': href,

View file

@@ -11,7 +11,6 @@
 # @todo paging

 from lxml import html
-from cgi import escape
 from dateutil import parser
 from datetime import datetime, timedelta
 import re
@@ -79,10 +78,10 @@ def response(resp):
         if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue

-        title = escape(extract_text(link))
+        title = extract_text(link)

         if result.xpath('./p[@class="desc clk"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
+            content = extract_text(result.xpath('./p[@class="desc clk"]'))
         else:
             content = ''

View file

@@ -10,7 +10,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from urllib import quote_plus
 from lxml import html
 from searx.languages import language_codes
@@ -59,7 +58,7 @@ def response(resp):
         elif search_lang:
             href = href + search_lang + '/'

-        title = escape(extract_text(link))
+        title = extract_text(link)

         content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
         content = content + " - "
@@ -75,7 +74,7 @@ def response(resp):
         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': escape(content)})
+                        'content': content})

     # return results
     return results

View file

@@ -10,7 +10,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from json import loads
 from urllib import urlencode, unquote
 import re
@@ -78,7 +77,7 @@ def response(resp):
             # append result
             results.append({'url': result['SourceUrl'],
-                            'title': escape(result['Title']),
+                            'title': result['Title'],
                             'content': '',
                             'img_src': img_url,
                             'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):
             # append result
             results.append({'url': result_url,
-                            'title': escape(result_title),
-                            'content': escape(result_content)})
+                            'title': result_title,
+                            'content': result_content})

     # parse images
     for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):
         # append result
         results.append({'url': result['SourceUrl'],
-                        'title': escape(result['Title']),
+                        'title': result['Title'],
                         'content': '',
                         'img_src': img_url,
                         'template': 'images.html'})

View file

@@ -11,7 +11,6 @@
 """

 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

View file

@@ -12,7 +12,6 @@
 """

 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

View file

@@ -9,7 +9,6 @@
 @parse url, title, content
 """

 import re
-from cgi import escape
 from searx.utils import is_valid_lang

 categories = ['general']
@@ -52,14 +51,14 @@ def request(query, params):
 def response(resp):
     results = []
     results.append({
-        'url': escape(web_url.format(
+        'url': web_url.format(
             from_lang=resp.search_params['from_lang'][2],
             to_lang=resp.search_params['to_lang'][2],
-            query=resp.search_params['query'])),
-        'title': escape('[{0}-{1}] {2}'.format(
+            query=resp.search_params['query']),
+        'title': '[{0}-{1}] {2}'.format(
             resp.search_params['from_lang'][1],
             resp.search_params['to_lang'][1],
-            resp.search_params['query'])),
-        'content': escape(resp.json()['responseData']['translatedText'])
+            resp.search_params['query']),
+        'content': resp.json()['responseData']['translatedText']
     })
     return results

View file

@@ -8,7 +8,6 @@
 # @stable no
 # @parse url, infobox

-from cgi import escape
 from json import loads
 from time import time
 from urllib import urlencode

View file

@@ -9,7 +9,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.search import logger
@@ -52,8 +51,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         try:
             res = {'url': result.xpath(url_xpath)[0],
-                   'title': escape(''.join(result.xpath(title_xpath))),
-                   'content': escape(''.join(result.xpath(content_xpath)))}
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
         except:
             logger.exception('yandex parse crash')
             continue