[enh] better url comparison

This commit is contained in:
asciimoo 2013-10-19 17:36:44 +02:00
parent 34941aca3f
commit 70cbc09e93

View file

@@ -22,6 +22,7 @@ from imp import load_source
import grequests import grequests
from itertools import izip_longest, chain from itertools import izip_longest, chain
from operator import itemgetter from operator import itemgetter
from urlparse import urlparse
engine_dir = dirname(realpath(__file__)) engine_dir = dirname(realpath(__file__))
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
results = [] results = []
# deduplication + scoring # deduplication + scoring
for i,res in enumerate(flat_res): for i,res in enumerate(flat_res):
res['parsed_url'] = urlparse(res['url'])
score = flat_len - i score = flat_len - i
duplicated = False duplicated = False
for new_res in results: for new_res in results:
if res['url'] == new_res['url']: if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
res['parsed_url'].path == new_res['parsed_url'].path:
duplicated = new_res duplicated = new_res
break break
if duplicated: if duplicated:
if len(res.get('content', '')) > len(duplicated.get('content', '')): if len(res.get('content', '')) > len(duplicated.get('content', '')):
duplicated['content'] = res['content'] duplicated['content'] = res['content']
duplicated['score'] += score duplicated['score'] += score
if duplicated['parsed_url'].scheme == 'https':
continue
elif res['parsed_url'].scheme == 'https':
duplicated['parsed_url'].scheme == 'https'
duplicated['url'] = duplicated['parsed_url'].geturl()
else: else:
res['score'] = score res['score'] = score
results.append(res) results.append(res)