forked from Ponysearch/Ponysearch

Merge pull request #57 from pointhi/results

improving publishDate extraction and output of it

commit 018a14431b

9 changed files with 26 additions and 15 deletions
requirements.txt

@@ -3,3 +3,4 @@ flask-babel
 grequests
 lxml
 pyyaml
+python-dateutil
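python-dateutil is the only new dependency in this pull request; the engines below either swap a hand-rolled strptime call for dateutil.parser or use it to populate publishedDate for the first time. A minimal sketch of the library, assuming nothing beyond what the diff itself uses (the sample string is illustrative):

    # The distribution is named python-dateutil, but the importable package is dateutil.
    from dateutil import parser

    # No explicit format string is needed; dateutil guesses the layout.
    print(parser.parse("31 Dec 2013 15:22"))
    # -> 2013-12-31 15:22:00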
searx/engines/google_news.py

@@ -2,6 +2,7 @@
 
 from urllib import urlencode
 from json import loads
+from dateutil import parser
 from datetime import datetime
 
 categories = ['news']

@@ -32,16 +33,9 @@ def response(resp):
         return []
 
     for result in search_res['responseData']['results']:
-        # S.149 (159), library.pdf
-        # datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700",
-        #                   "%a, %d %b %Y %H:%M:%S %z")
-        # publishedDate = parse(result['publishedDate'])
-        publishedDate = datetime.strptime(
-            str.join(' ', result['publishedDate'].split(None)[0:5]),
-            "%a, %d %b %Y %H:%M:%S")
-        #utc_offset = timedelta(result['publishedDate'].split(None)[5])
-        # local = utc + offset
-        #publishedDate = publishedDate + utc_offset
+        # Mon, 10 Mar 2014 16:26:15 -0700
+        publishedDate = parser.parse(result['publishedDate'])
 
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
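The removed block split the UTC offset off by hand because this module targets Python 2 (note the urllib.urlencode import), where datetime.strptime rejects the %z directive (which is why the commented-out %z attempt was left behind), so the parsed value silently dropped the -0700. A before/after sketch on the sample date kept in the new comment:

    from datetime import datetime
    from dateutil import parser

    raw = "Mon, 10 Mar 2014 16:26:15 -0700"

    # Old approach: keep only the first five fields, losing the UTC offset.
    naive = datetime.strptime(" ".join(raw.split(None)[0:5]),
                              "%a, %d %b %Y %H:%M:%S")
    # -> datetime(2014, 3, 10, 16, 26, 15), no tzinfo

    # New approach: dateutil understands the offset and returns an aware datetime.
    aware = parser.parse(raw)
    # -> datetime(2014, 3, 10, 16, 26, 15, tzinfo=tzoffset(None, -25200))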
searx/engines/vimeo.py

@@ -2,6 +2,8 @@ from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from xpath import extract_text
+from datetime import datetime
+from dateutil import parser
 
 base_url = 'http://vimeo.com'
 search_url = base_url + '/search?{query}'

@@ -10,6 +12,7 @@ content_xpath = None
 title_xpath = None
 results_xpath = ''
 content_tpl = '<a href="{0}"> <img src="{2}"/> </a>'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
 
 # the cookie set by vimeo contains all the following values,
 # but only __utma seems to be requiered

@@ -40,9 +43,12 @@ def response(resp):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
         thumbnail = extract_text(result.xpath(content_xpath)[0])
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
 
         results.append({'url': url,
                         'title': title,
                         'content': content_tpl.format(url, title, thumbnail),
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
     return results
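The new publishedDate_xpath selects the datetime attribute inside each result's "meta" paragraph, so parser.parse receives a machine-readable timestamp rather than whatever human-readable text the page shows. A standalone sketch with lxml, using a made-up result snippet (the real markup served by vimeo.com may differ, and the engine additionally routes the value through searx's extract_text helper, which this sketch skips because attribute results are already plain strings):

    from lxml import html
    from dateutil import parser

    # Hypothetical markup shaped like one search result; the real page may differ.
    snippet = html.fromstring(
        '<li><p class="meta">'
        '<time datetime="2014-03-10T16:26:15+01:00">4 days ago</time>'
        '</p></li>')

    publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'

    # The attribute axis yields the attribute values themselves as strings.
    raw = snippet.xpath(publishedDate_xpath)[0]
    publishedDate = parser.parse(raw)
    # -> 2014-03-10 16:26:15+01:00 (timezone-aware)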
searx/engines/yahoo_news.py

@@ -6,6 +6,7 @@ from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import parse_url
 from datetime import datetime, timedelta
 import re
+from dateutil import parser
 
 categories = ['news']
 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'

@@ -52,9 +53,7 @@ def response(resp):
                 - timedelta(hours=int(timeNumbers[0]))\
                 - timedelta(minutes=int(timeNumbers[1]))
         else:
-            # TODO year in string possible?
-            publishedDate = datetime.strptime(publishedDate,
-                                              "%b %d %H:%M%p")
+            publishedDate = parser.parse(publishedDate)
 
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
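The format string being removed, "%b %d %H:%M%p", has no year field, so strptime fell back to 1900 and the year == 1900 fix-up kept in the context above patched it afterwards; dateutil fills missing fields from the current date instead, which leaves that branch as little more than a safety net. A sketch of the difference on an assumed Yahoo-style date string (the exact shape of live responses may differ):

    from datetime import datetime
    from dateutil import parser

    raw = "Mar 10 04:26PM"  # assumed shape: no year in the string

    # Old: strptime defaults the missing year to 1900 (and %p is ignored next
    # to %H, one more reason the hand-written parsing was fragile).
    old = datetime.strptime(raw, "%b %d %H:%M%p")
    assert old.year == 1900

    # New: dateutil fills missing fields from today's date, so the year is current
    # and the `publishedDate.year == 1900` branch above normally never fires.
    new = parser.parse(raw)
    assert new.year == datetime.now().year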
searx/engines/youtube.py

@@ -1,5 +1,7 @@
 from json import loads
 from urllib import urlencode
+from dateutil import parser
+from datetime import datetime
 
 categories = ['videos']
 

@@ -35,6 +37,10 @@ def response(resp):
         content = ''
         thumbnail = ''
 
+        #"2013-12-31T15:22:51.000Z"
+        pubdate = result['published']['$t']
+        publishedDate = parser.parse(pubdate)
+
         if result['media$group']['media$thumbnail']:
             thumbnail = result['media$group']['media$thumbnail'][0]['url']
             content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail) # noqa

@@ -48,6 +54,7 @@ def response(resp):
                         'title': title,
                         'content': content,
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
 
     return results
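The published.$t field of the YouTube feed is an ISO 8601 timestamp like the one quoted in the added comment, which dateutil turns directly into a UTC-aware datetime. A small sketch mirroring the added lines against a hand-built result dict (only the key the new code reads is filled in):

    from dateutil import parser

    # Stand-in for one entry of the JSON feed; shape follows the keys used above.
    result = {'published': {'$t': "2013-12-31T15:22:51.000Z"}}

    pubdate = result['published']['$t']
    publishedDate = parser.parse(pubdate)
    # -> datetime(2013, 12, 31, 15, 22, 51, tzinfo=tzutc()), i.e. aware UTC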
searx/templates/opensearch_response_rss.xml

@@ -16,6 +16,7 @@
 <title>{{ r.title }}</title>
 <link>{{ r.url }}</link>
 <description>{{ r.content }}</description>
+{% if r.pubdate %}<pubDate>{{ r.pubdate }}</pubDate>{% endif %}
 </item>
 {% endfor %}
 </channel>
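r.pubdate is the raw string that webapp.py (further down in this diff) builds with strftime('%a, %d %b %Y %H:%M:%S %z'); RSS 2.0 expects <pubDate> in that RFC 822 form rather than the localized text used for the HTML templates. A sketch of what ends up between the tags, assuming an aware datetime (with a naive one, %z renders as an empty string):

    from datetime import datetime
    from dateutil.tz import tzutc

    published = datetime(2014, 3, 10, 16, 26, 15, tzinfo=tzutc())  # example value
    pubdate = published.strftime('%a, %d %b %Y %H:%M:%S %z')
    # -> 'Mon, 10 Mar 2014 16:26:15 +0000', ready for <pubDate>...</pubDate>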
searx/templates/result_templates/videos.html

@@ -5,6 +5,7 @@
 
 <p>
 <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
+{% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
 <a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a>
 <p class="url">{{ result.url }}</p>
 </p>
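By the time this template renders, result.publishedDate is no longer a datetime: webapp.py has already replaced it with either an "N hour(s), N minute(s) ago" string or a locale-formatted date, so the added line only needs to guard against results that carry no date at all. A minimal render of just that line with plain Jinja2, outside Flask, using made-up values:

    from jinja2 import Template

    line = ('{% if result.publishedDate %}'
            '<p class="published_date">{{ result.publishedDate }}</p>'
            '{% endif %}')
    t = Template(line)

    print(t.render(result={'publishedDate': '2 hour(s), 16 minute(s) ago'}))
    # -> <p class="published_date">2 hour(s), 16 minute(s) ago</p>
    print(t.render(result={}))  # no date set: the guard renders nothing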
searx/webapp.py

@@ -159,8 +159,8 @@ def index():
 
         # TODO, check if timezone is calculated right
         if 'publishedDate' in result:
-            if result['publishedDate'] >= datetime.now() - timedelta(days=1):
-                timedifference = datetime.now() - result['publishedDate']
+            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
+                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                 minutes = int((timedifference.seconds / 60) % 60)
                 hours = int(timedifference.seconds / 60 / 60)
                 if hours == 0:

@@ -168,6 +168,7 @@ def index():
                 else:
                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) # noqa
             else:
+                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S %z')
                 result['publishedDate'] = format_date(result['publishedDate'])
 
         if search.request_data.get('format') == 'json':
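The two .replace(tzinfo=None) calls are what make the dateutil switch safe here: several engines now return timezone-aware datetimes, and Python refuses to compare or subtract aware and naive values, so the datetime.now() arithmetic would otherwise raise. A sketch of the failure mode and of the fix used above; note that stripping tzinfo simply ignores the offset rather than converting to local time, which is presumably what the "TODO, check if timezone is calculated right" comment refers to:

    from datetime import datetime, timedelta
    from dateutil import parser

    published = parser.parse("Mon, 10 Mar 2014 16:26:15 -0700")  # aware
    now = datetime.now()                                         # naive

    try:
        published >= now - timedelta(days=1)
    except TypeError as exc:
        print(exc)  # can't compare offset-naive and offset-aware datetimes

    # The fix applied in index(): drop tzinfo before comparing and subtracting.
    recent = published.replace(tzinfo=None) >= now - timedelta(days=1)
    timedifference = now - published.replace(tzinfo=None)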
setup.py

@@ -35,6 +35,7 @@ setup(
         'lxml',
         'pyyaml',
         'setuptools',
+        'python-dateutil',
     ],
     extras_require={
         'test': [