forked from Ponysearch/Ponysearch
[mod] result.py: merge infobox URLs and attributes when they have the same label or the same entity.
Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42).
This commit is contained in:
parent
23f4203dfb
commit
382fded665
1 changed file with 26 additions and 8 deletions
|
@ -20,6 +20,18 @@ def result_content_len(content):
|
||||||
|
|
||||||
|
|
||||||
def compare_urls(url_a, url_b):
|
def compare_urls(url_a, url_b):
|
||||||
|
"""Lazy compare between two URL.
|
||||||
|
"www.example.com" and "example.com" are equals.
|
||||||
|
"www.example.com/path/" and "www.example.com/path" are equals.
|
||||||
|
"https://www.example.com/" and "http://www.example.com/" are equals.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url_a (ParseResult): first URL
|
||||||
|
url_b (ParseResult): second URL
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if url_a and url_b are equals
|
||||||
|
"""
|
||||||
# ignore www. in comparison
|
# ignore www. in comparison
|
||||||
if url_a.netloc.startswith('www.'):
|
if url_a.netloc.startswith('www.'):
|
||||||
host_a = url_a.netloc.replace('www.', '', 1)
|
host_a = url_a.netloc.replace('www.', '', 1)
|
||||||
|
@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
|
||||||
for url2 in infobox2.get('urls', []):
|
for url2 in infobox2.get('urls', []):
|
||||||
unique_url = True
|
unique_url = True
|
||||||
parsed_url2 = urlparse(url2.get('url', ''))
|
parsed_url2 = urlparse(url2.get('url', ''))
|
||||||
|
entity_url2 = url2.get('entity')
|
||||||
for url1 in urls1:
|
for url1 in urls1:
|
||||||
if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
|
if (entity_url2 is not None and url1.get('entity') == entity_url2)\
|
||||||
|
or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
|
||||||
unique_url = False
|
unique_url = False
|
||||||
break
|
break
|
||||||
if unique_url:
|
if unique_url:
|
||||||
|
@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
|
||||||
infobox1['img_src'] = img2
|
infobox1['img_src'] = img2
|
||||||
|
|
||||||
if 'attributes' in infobox2:
|
if 'attributes' in infobox2:
|
||||||
attributes1 = infobox1.get('attributes', None)
|
attributes1 = infobox1.get('attributes')
|
||||||
if attributes1 is None:
|
if attributes1 is None:
|
||||||
attributes1 = []
|
infobox1['attributes'] = attributes1 = []
|
||||||
infobox1['attributes'] = attributes1
|
|
||||||
|
|
||||||
attributeSet = set()
|
attributeSet = set()
|
||||||
for attribute in infobox1.get('attributes', []):
|
for attribute in attributes1:
|
||||||
if attribute.get('label', None) not in attributeSet:
|
label = attribute.get('label')
|
||||||
attributeSet.add(attribute.get('label', None))
|
if label not in attributeSet:
|
||||||
|
attributeSet.add(label)
|
||||||
|
entity = attribute.get('entity')
|
||||||
|
if entity not in attributeSet:
|
||||||
|
attributeSet.add(entity)
|
||||||
|
|
||||||
for attribute in infobox2.get('attributes', []):
|
for attribute in infobox2.get('attributes', []):
|
||||||
if attribute.get('label', None) not in attributeSet:
|
if attribute.get('label') not in attributeSet\
|
||||||
|
and attribute.get('entity') not in attributeSet:
|
||||||
attributes1.append(attribute)
|
attributes1.append(attribute)
|
||||||
|
|
||||||
if 'content' in infobox2:
|
if 'content' in infobox2:
|
||||||
|
|
Loading…
Reference in a new issue