[mod][fix] https rewrite refactor ++ fixes

Adam Tauber 2014-12-19 22:40:37 +01:00
parent 813247b37a
commit f141773814
3 changed files with 68 additions and 61 deletions

View file

@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 '''
 import re
+from urlparse import urlparse
 from lxml import etree
 from os import listdir
 from os.path import isfile, isdir, join
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
             # TODO hack, which convert a javascript regex group
             # into a valid python regex group
-            rule_from = ruleset.attrib.get('from').replace('$', '\\')
-            rule_to = ruleset.attrib.get('to').replace('$', '\\')
+            rule_from = ruleset.attrib['from'].replace('$', '\\')
+            if rule_from.endswith('\\'):
+                rule_from = rule_from[:-1]+'$'
+            rule_to = ruleset.attrib['to'].replace('$', '\\')
+            if rule_to.endswith('\\'):
+                rule_to = rule_to[:-1]+'$'

             # TODO, not working yet because of the hack above,
             # currently doing that in webapp.py
             # rule_from_rgx = re.compile(rule_from, re.I)

             # append rule
-            rules.append((rule_from, rule_to))
+            try:
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
+            except:
+                # TODO log regex error
+                continue

         # this child define an exclusion
         elif ruleset.tag == 'exclusion':
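Note: the "hack" above turns JavaScript-style $1 backreferences from the HTTPS-Everywhere-style rulesets into Python's \1 by replacing every '$' with a backslash; the new endswith('\\') checks restore a trailing '$' anchor that this replacement would otherwise destroy, and the "from" pattern is now precompiled with re.I | re.U. A minimal sketch of that conversion (the rule below is made up for illustration, not taken from a shipped ruleset):

```python
import re

# Hypothetical ruleset entry in the shipped "from"/"to" notation
js_from = r'^http://(www\.)?example\.com/'
js_to = 'https://$1example.com/'

# Same conversion as in the hunk above: '$1' -> '\1'; a trailing '$' anchor
# would become a stray backslash, which the endswith() check repairs.
py_to = js_to.replace('$', '\\')
if py_to.endswith('\\'):
    py_to = py_to[:-1] + '$'

rule = re.compile(js_from, re.I | re.U)
print(rule.sub(py_to, 'http://www.example.com/tracks'))
# -> https://www.example.com/tracks
```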
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
         https_rules.append(ruleset)

     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
+
+
+def https_url_rewrite(result):
+    skip_https_rewrite = False
+    # check if HTTPS rewrite is possible
+    for target, rules, exclusions in https_rules:
+        # check if target regex matches the url
+        if target.match(result['parsed_url'].netloc):
+            # process exclusions
+            for exclusion in exclusions:
+                # check if exclusion matches the url
+                if exclusion.match(result['url']):
+                    skip_https_rewrite = True
+                    break
+
+            # skip https rewrite if required
+            if skip_https_rewrite:
+                break
+
+            # process rules
+            for rule in rules:
+                try:
+                    new_result_url = rule[0].sub(rule[1], result['url'])
+                except:
+                    break
+
+                # parse new url
+                new_parsed_url = urlparse(new_result_url)
+
+                # continue if nothing was rewritten
+                if result['url'] == new_result_url:
+                    continue
+
+                # get domainname from result
+                # TODO: only works correctly with TLDs like
+                # asdf.com, not with asdf.com.de
+                # TODO: use publicsuffix instead of this rewrite rule
+                old_result_domainname = '.'.join(
+                    result['parsed_url'].hostname.split('.')[-2:])
+                new_result_domainname = '.'.join(
+                    new_parsed_url.hostname.split('.')[-2:])
+
+                # check if the rewritten hostname is the same,
+                # to protect against wrong or malicious rewrite rules
+                if old_result_domainname == new_result_domainname:
+                    # set new url
+                    result['url'] = new_result_url
+
+            # target has matched, do not search over the other rules
+            break
+
+    return result
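For reference, a minimal usage sketch of the new helper outside of webapp.py. The module and function names come from this diff; the ruleset directory path and the example URL are assumptions:

```python
from urlparse import urlparse  # Python 2, matching this commit

from searx import https_rewrite

# Assumed path: load_https_rules() fills the module-level https_rules list
# that https_url_rewrite() iterates over.
https_rewrite.load_https_rules('searx/https_rules')

# A result dict shaped like the ones webapp.py passes in; the function reads
# result['url'] plus the netloc/hostname of result['parsed_url'].
result = {
    'url': 'http://soundcloud.com/some-track',
    'parsed_url': urlparse('http://soundcloud.com/some-track'),
}
result = https_rewrite.https_url_rewrite(result)
print(result['url'])  # rewritten to https:// if a target matched, else unchanged
```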

View file

@@ -89,7 +89,7 @@
 <rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
     to="https://$1.sndcdn.com/" />
-<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
+<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
     to="https://$1soundcloud.com/" />
 <rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"

View file

@@ -41,15 +41,12 @@ from searx.utils import (
     UnicodeWriter, highlight_content, html_to_text, get_themes
 )
 from searx.version import VERSION_STRING
-from searx.https_rewrite import https_rules
 from searx.languages import language_codes
+from searx.https_rewrite import https_url_rewrite
 from searx.search import Search
 from searx.query import Query
 from searx.autocomplete import backends as autocomplete_backends
-from urlparse import urlparse
-import re

 static_path, templates_path, themes =\
     get_themes(settings['themes_path']
@@ -215,59 +212,7 @@ def index():
         if settings['server']['https_rewrite']\
            and result['parsed_url'].scheme == 'http':
-            skip_https_rewrite = False
-
-            # check if HTTPS rewrite is possible
-            for target, rules, exclusions in https_rules:
-                # check if target regex match with url
-                if target.match(result['url']):
-                    # process exclusions
-                    for exclusion in exclusions:
-                        # check if exclusion match with url
-                        if exclusion.match(result['url']):
-                            skip_https_rewrite = True
-                            break
-
-                    # skip https rewrite if required
-                    if skip_https_rewrite:
-                        break
-
-                    # process rules
-                    for rule in rules:
-                        try:
-                            # TODO, precompile rule
-                            p = re.compile(rule[0])
-                            # rewrite url if possible
-                            new_result_url = p.sub(rule[1], result['url'])
-                        except:
-                            break
-
-                        # parse new url
-                        new_parsed_url = urlparse(new_result_url)
-
-                        # continiue if nothing was rewritten
-                        if result['url'] == new_result_url:
-                            continue
-
-                        # get domainname from result
-                        # TODO, does only work correct with TLD's like
-                        # asdf.com, not for asdf.com.de
-                        # TODO, using publicsuffix instead of this rewrite rule
-                        old_result_domainname = '.'.join(
-                            result['parsed_url'].hostname.split('.')[-2:])
-                        new_result_domainname = '.'.join(
-                            new_parsed_url.hostname.split('.')[-2:])
-
-                        # check if rewritten hostname is the same,
-                        # to protect against wrong or malicious rewrite rules
-                        if old_result_domainname == new_result_domainname:
-                            # set new url
-                            result['url'] = new_result_url
-
-                # target has matched, do not search over the other rules
-                break
+            result = https_url_rewrite(result)

         if search.request_data.get('format', 'html') == 'html':
             if 'content' in result: