extract publishDate from vimeo

This commit is contained in:
Thomas Pointhuber 2014-03-18 15:56:22 +01:00
parent 337bd6d907
commit 993271bed3
2 changed files with 7 additions and 1 deletions

View file

@@ -2,6 +2,8 @@ from urllib import urlencode
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from lxml import html from lxml import html
from xpath import extract_text from xpath import extract_text
from datetime import datetime
from dateutil import parser
base_url = 'http://vimeo.com' base_url = 'http://vimeo.com'
search_url = base_url + '/search?{query}' search_url = base_url + '/search?{query}'
@@ -10,6 +12,7 @@ content_xpath = None
title_xpath = None title_xpath = None
results_xpath = '' results_xpath = ''
content_tpl = '<a href="{0}"> <img src="{2}"/> </a>' content_tpl = '<a href="{0}"> <img src="{2}"/> </a>'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
# the cookie set by vimeo contains all the following values, # the cookie set by vimeo contains all the following values,
# but only __utma seems to be required # but only __utma seems to be required
@@ -40,9 +43,12 @@ def response(resp):
url = base_url + result.xpath(url_xpath)[0] url = base_url + result.xpath(url_xpath)[0]
title = p.unescape(extract_text(result.xpath(title_xpath))) title = p.unescape(extract_text(result.xpath(title_xpath)))
thumbnail = extract_text(result.xpath(content_xpath)[0]) thumbnail = extract_text(result.xpath(content_xpath)[0])
publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
results.append({'url': url, results.append({'url': url,
'title': title, 'title': title,
'content': content_tpl.format(url, title, thumbnail), 'content': content_tpl.format(url, title, thumbnail),
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail}) 'thumbnail': thumbnail})
return results return results