forked from Ponysearch/Ponysearch
[enh] fix content fetching, parse published date from description
This commit is contained in:
parent
a959977ab4
commit
4508c96667
2 changed files with 40 additions and 9 deletions
|
@ -12,6 +12,8 @@
|
||||||
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from cgi import escape
|
from cgi import escape
|
||||||
|
from dateutil import parser
|
||||||
|
from datetime import datetime, timedelta
|
||||||
import re
|
import re
|
||||||
from searx.engines.xpath import extract_text
|
from searx.engines.xpath import extract_text
|
||||||
|
|
||||||
|
@ -79,11 +81,40 @@ def response(resp):
|
||||||
|
|
||||||
title = escape(extract_text(link))
|
title = escape(extract_text(link))
|
||||||
|
|
||||||
if result.xpath('./p[@class="desc"]'):
|
if result.xpath('./p[@class="desc clk"]'):
|
||||||
content = escape(extract_text(result.xpath('./p[@class="desc"]')))
|
content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
|
||||||
else:
|
else:
|
||||||
content = ''
|
content = ''
|
||||||
|
|
||||||
|
published_date = None
|
||||||
|
|
||||||
|
# check if search result starts with something like: "2 Sep 2014 ... "
|
||||||
|
if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
|
||||||
|
date_pos = content.find('...')+4
|
||||||
|
date_string = content[0:date_pos-5]
|
||||||
|
published_date = parser.parse(date_string, dayfirst=True)
|
||||||
|
|
||||||
|
# fix content string
|
||||||
|
content = content[date_pos:]
|
||||||
|
|
||||||
|
# check if search result starts with something like: "5 days ago ... "
|
||||||
|
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
|
||||||
|
date_pos = content.find('...')+4
|
||||||
|
date_string = content[0:date_pos-5]
|
||||||
|
|
||||||
|
# calculate datetime
|
||||||
|
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
|
||||||
|
|
||||||
|
# fix content string
|
||||||
|
content = content[date_pos:]
|
||||||
|
|
||||||
|
if published_date:
|
||||||
|
# append result
|
||||||
|
results.append({'url': url,
|
||||||
|
'title': title,
|
||||||
|
'content': content,
|
||||||
|
'publishedDate': published_date})
|
||||||
|
else:
|
||||||
# append result
|
# append result
|
||||||
results.append({'url': url,
|
results.append({'url': url,
|
||||||
'title': title,
|
'title': title,
|
||||||
|
|
|
@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
|
||||||
</a>
|
</a>
|
||||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||||
</h3>
|
</h3>
|
||||||
<p class='desc'>
|
<p class='desc clk'>
|
||||||
This should be the content.
|
This should be the content.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
|
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
|
||||||
</a>
|
</a>
|
||||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||||
</h3>
|
</h3>
|
||||||
<p class='desc'>
|
<p class='desc clk'>
|
||||||
This should be the content.
|
This should be the content.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
|
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
|
||||||
<h3>
|
<h3>
|
||||||
<span id='title_stars_2' name='title_stars_2'> </span>
|
<span id='title_stars_2' name='title_stars_2'> </span>
|
||||||
</h3>
|
</h3>
|
||||||
<p class='desc'>
|
<p class='desc clk'>
|
||||||
This should be the content.
|
This should be the content.
|
||||||
</p>
|
</p>
|
||||||
<p>
|
<p>
|
||||||
|
|
Loading…
Reference in a new issue