Merge pull request #57 from pointhi/results

Improve publishedDate extraction and its output
This commit is contained in:
Adam Tauber 2014-03-18 18:20:10 +01:00
commit 018a14431b
9 changed files with 26 additions and 15 deletions

View file

@ -3,3 +3,4 @@ flask-babel
grequests grequests
lxml lxml
pyyaml pyyaml
python-dateutil

View file

@ -2,6 +2,7 @@
from urllib import urlencode from urllib import urlencode
from json import loads from json import loads
from dateutil import parser
from datetime import datetime from datetime import datetime
categories = ['news'] categories = ['news']
@ -32,16 +33,9 @@ def response(resp):
return [] return []
for result in search_res['responseData']['results']: for result in search_res['responseData']['results']:
# S.149 (159), library.pdf
# datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700", # Mon, 10 Mar 2014 16:26:15 -0700
# "%a, %d %b %Y %H:%M:%S %z") publishedDate = parser.parse(result['publishedDate'])
# publishedDate = parse(result['publishedDate'])
publishedDate = datetime.strptime(
str.join(' ', result['publishedDate'].split(None)[0:5]),
"%a, %d %b %Y %H:%M:%S")
#utc_offset = timedelta(result['publishedDate'].split(None)[5])
# local = utc + offset
#publishedDate = publishedDate + utc_offset
results.append({'url': result['unescapedUrl'], results.append({'url': result['unescapedUrl'],
'title': result['titleNoFormatting'], 'title': result['titleNoFormatting'],

View file

@ -2,6 +2,8 @@ from urllib import urlencode
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from lxml import html from lxml import html
from xpath import extract_text from xpath import extract_text
from datetime import datetime
from dateutil import parser
base_url = 'http://vimeo.com' base_url = 'http://vimeo.com'
search_url = base_url + '/search?{query}' search_url = base_url + '/search?{query}'
@ -10,6 +12,7 @@ content_xpath = None
title_xpath = None title_xpath = None
results_xpath = '' results_xpath = ''
content_tpl = '<a href="{0}"> <img src="{2}"/> </a>' content_tpl = '<a href="{0}"> <img src="{2}"/> </a>'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
# the cookie set by vimeo contains all the following values, # the cookie set by vimeo contains all the following values,
# but only __utma seems to be requiered # but only __utma seems to be requiered
@ -40,9 +43,12 @@ def response(resp):
url = base_url + result.xpath(url_xpath)[0] url = base_url + result.xpath(url_xpath)[0]
title = p.unescape(extract_text(result.xpath(title_xpath))) title = p.unescape(extract_text(result.xpath(title_xpath)))
thumbnail = extract_text(result.xpath(content_xpath)[0]) thumbnail = extract_text(result.xpath(content_xpath)[0])
publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
results.append({'url': url, results.append({'url': url,
'title': title, 'title': title,
'content': content_tpl.format(url, title, thumbnail), 'content': content_tpl.format(url, title, thumbnail),
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail}) 'thumbnail': thumbnail})
return results return results

View file

@ -6,6 +6,7 @@ from searx.engines.xpath import extract_text, extract_url
from searx.engines.yahoo import parse_url from searx.engines.yahoo import parse_url
from datetime import datetime, timedelta from datetime import datetime, timedelta
import re import re
from dateutil import parser
categories = ['news'] categories = ['news']
search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}' search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
@ -52,9 +53,7 @@ def response(resp):
- timedelta(hours=int(timeNumbers[0]))\ - timedelta(hours=int(timeNumbers[0]))\
- timedelta(minutes=int(timeNumbers[1])) - timedelta(minutes=int(timeNumbers[1]))
else: else:
# TODO year in string possible? publishedDate = parser.parse(publishedDate)
publishedDate = datetime.strptime(publishedDate,
"%b %d %H:%M%p")
if publishedDate.year == 1900: if publishedDate.year == 1900:
publishedDate = publishedDate.replace(year=datetime.now().year) publishedDate = publishedDate.replace(year=datetime.now().year)

View file

@ -1,5 +1,7 @@
from json import loads from json import loads
from urllib import urlencode from urllib import urlencode
from dateutil import parser
from datetime import datetime
categories = ['videos'] categories = ['videos']
@ -35,6 +37,10 @@ def response(resp):
content = '' content = ''
thumbnail = '' thumbnail = ''
#"2013-12-31T15:22:51.000Z"
pubdate = result['published']['$t']
publishedDate = parser.parse(pubdate)
if result['media$group']['media$thumbnail']: if result['media$group']['media$thumbnail']:
thumbnail = result['media$group']['media$thumbnail'][0]['url'] thumbnail = result['media$group']['media$thumbnail'][0]['url']
content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail) # noqa content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail) # noqa
@ -48,6 +54,7 @@ def response(resp):
'title': title, 'title': title,
'content': content, 'content': content,
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail}) 'thumbnail': thumbnail})
return results return results

View file

@ -16,6 +16,7 @@
<title>{{ r.title }}</title> <title>{{ r.title }}</title>
<link>{{ r.url }}</link> <link>{{ r.url }}</link>
<description>{{ r.content }}</description> <description>{{ r.content }}</description>
{% if r.pubdate %}<pubDate>{{ r.pubdate }}</pubDate>{% endif %}
</item> </item>
{% endfor %} {% endfor %}
</channel> </channel>

View file

@ -5,6 +5,7 @@
<p> <p>
<h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3> <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
{% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
<a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a> <a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a>
<p class="url">{{ result.url }}</p> <p class="url">{{ result.url }}</p>
</p> </p>

View file

@ -159,8 +159,8 @@ def index():
# TODO, check if timezone is calculated right # TODO, check if timezone is calculated right
if 'publishedDate' in result: if 'publishedDate' in result:
if result['publishedDate'] >= datetime.now() - timedelta(days=1): if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
timedifference = datetime.now() - result['publishedDate'] timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
minutes = int((timedifference.seconds / 60) % 60) minutes = int((timedifference.seconds / 60) % 60)
hours = int(timedifference.seconds / 60 / 60) hours = int(timedifference.seconds / 60 / 60)
if hours == 0: if hours == 0:
@ -168,6 +168,7 @@ def index():
else: else:
result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) # noqa result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes) # noqa
else: else:
result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S %z')
result['publishedDate'] = format_date(result['publishedDate']) result['publishedDate'] = format_date(result['publishedDate'])
if search.request_data.get('format') == 'json': if search.request_data.get('format') == 'json':

View file

@ -35,6 +35,7 @@ setup(
'lxml', 'lxml',
'pyyaml', 'pyyaml',
'setuptools', 'setuptools',
'python-dateutil',
], ],
extras_require={ extras_require={
'test': [ 'test': [