forked from Ponysearch/Ponysearch
parent a0a1284998
commit c2e4014287
2 changed files with 25 additions and 21 deletions
@@ -99,9 +99,8 @@ def response(resp):
         return []
 
     # link to wikipedia article
-    # parenthesis are not quoted to make infobox mergeable with wikidata's
     wikipedia_link = url_lang(resp.search_params['language']) \
-        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
 
     results.append({'url': wikipedia_link, 'title': title})
 
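Note on this hunk: quote() percent-encodes parentheses, so the dropped .replace() calls were what turned %28/%29 back into literal brackets in the generated link. A minimal sketch, assuming the standard library quote and a made-up title:

    from urllib.parse import quote

    title = 'Python (programming language)'               # hypothetical title
    path = quote(title.replace(' ', '_').encode('utf8'))
    print(path)                                            # Python_%28programming_language%29
    print(path.replace('%28', '(').replace('%29', ')'))    # Python_(programming_language), the old output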
@@ -18,7 +18,17 @@ def result_content_len(content):
 
 
 def compare_urls(url_a, url_b):
-    if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
+    # ignore www. in comparison
+    if url_a.netloc.startswith('www.'):
+        host_a = url_a.netloc.replace('www.', '', 1)
+    else:
+        host_a = url_a.netloc
+    if url_b.netloc.startswith('www.'):
+        host_b = url_b.netloc.replace('www.', '', 1)
+    else:
+        host_b = url_b.netloc
+
+    if host_a != host_b or url_a.query != url_b.query:
         return False
 
     # remove / from the end of the url if required
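A quick sketch of the new comparison with made-up URLs, assuming the rest of compare_urls (the path check below this hunk) is unchanged: a leading 'www.' no longer makes two otherwise identical URLs compare as different, while a differing query string still does.

    from urllib.parse import urlparse

    compare_urls(urlparse('https://www.example.org/about'),
                 urlparse('https://example.org/about'))      # True after this change
    compare_urls(urlparse('https://example.org/?q=a'),
                 urlparse('https://example.org/?q=b'))        # False: query differs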
@@ -37,16 +47,18 @@ def merge_two_infoboxes(infobox1, infobox2):
     urls1 = infobox1.get('urls', None)
     if urls1 is None:
         urls1 = []
 
+    for url2 in infobox2.get('urls', []):
+        unique_url = True
+        for url1 in infobox1.get('urls', []):
+            if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
+                unique_url = False
+                break
+        if unique_url:
+            urls1.append(url2)
+
     infobox1['urls'] = urls1
 
-    urlSet = set()
-    for url in infobox1.get('urls', []):
-        urlSet.add(url.get('url', None))
-
-    for url in infobox2.get('urls', []):
-        if url.get('url', None) not in urlSet:
-            urls1.append(url)
 
     if 'img_src' in infobox2:
         img1 = infobox1.get('img_src', None)
         img2 = infobox2.get('img_src')
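Rough illustration of the reworked URL merge with two made-up infoboxes: entries whose URLs differ only by 'www.' are now detected as duplicates via compare_urls, where the old urlSet membership test only caught byte-identical strings.

    infobox1 = {'urls': [{'title': 'Official site', 'url': 'https://www.example.org/'}]}
    infobox2 = {'urls': [{'title': 'Official site', 'url': 'https://example.org/'},
                         {'title': 'Wiki', 'url': 'https://wiki.example.org/'}]}

    merge_two_infoboxes(infobox1, infobox2)
    [u['url'] for u in infobox1['urls']]
    # expected: ['https://www.example.org/', 'https://wiki.example.org/']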
@@ -97,7 +109,6 @@ class ResultContainer(object):
         self.results = defaultdict(list)
         self._merged_results = []
         self.infoboxes = []
-        self._infobox_ids = {}
         self.suggestions = set()
         self.answers = set()
         self._number_of_results = []
@@ -138,14 +149,13 @@ class ResultContainer(object):
         add_infobox = True
         infobox_id = infobox.get('id', None)
         if infobox_id is not None:
-            existingIndex = self._infobox_ids.get(infobox_id, None)
-            if existingIndex is not None:
-                merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
-                add_infobox = False
+            for existingIndex in self.infoboxes:
+                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
+                    merge_two_infoboxes(existingIndex, infobox)
+                    add_infobox = False
 
         if add_infobox:
             self.infoboxes.append(infobox)
-            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
 
     def _merge_result(self, result, position):
         result['parsed_url'] = urlparse(result['url'])
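The old dict lookup only merged infoboxes whose ids were byte-identical; the new loop parses both ids and reuses compare_urls, so equivalent URL forms (for example with and without 'www.') now match too. Hedged sketch with made-up ids:

    existing = {'id': 'https://www.wikidata.org/wiki/Q42', 'urls': []}   # already in self.infoboxes
    incoming = {'id': 'https://wikidata.org/wiki/Q42', 'urls': []}       # hypothetical new infobox

    # old: self._infobox_ids.get(incoming['id']) -> None, so a second copy was appended
    # new: compare_urls(urlparse(existing.get('id', '')), urlparse(incoming['id'])) -> True,
    #      so merge_two_infoboxes(existing, incoming) runs and no duplicate is added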
@@ -155,11 +165,6 @@ class ResultContainer(object):
             result['parsed_url'] = result['parsed_url']._replace(scheme="http")
             result['url'] = result['parsed_url'].geturl()
 
-        result['host'] = result['parsed_url'].netloc
-
-        if result['host'].startswith('www.'):
-            result['host'] = result['host'].replace('www.', '', 1)
-
         result['engines'] = [result['engine']]
 
         # strip multiple spaces and cariage returns from content
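For reference, the five removed lines computed result['host'] with a leading 'www.' stripped; a sketch of that old behaviour with a made-up result dict:

    from urllib.parse import urlparse

    result = {'parsed_url': urlparse('https://www.example.org/page')}   # hypothetical result
    result['host'] = result['parsed_url'].netloc                        # 'www.example.org'
    if result['host'].startswith('www.'):
        result['host'] = result['host'].replace('www.', '', 1)          # 'example.org'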