[fix] wolframalpha page changes

related issues: #508 #509
This commit is contained in:
Adam Tauber 2016-02-17 17:07:19 +01:00
parent 4e5af8d87b
commit d06178139f
2 changed files with 59 additions and 223 deletions

View file

@ -8,79 +8,85 @@
# @stable no # @stable no
# @parse answer # @parse answer
from re import search, sub from cgi import escape
from json import loads from json import loads
from time import time
from urllib import urlencode from urllib import urlencode
from lxml import html
import HTMLParser from searx.poolrequests import get as http_get
# search-url # search-url
url = 'http://www.wolframalpha.com/' url = 'https://www.wolframalpha.com/'
search_url = url + 'input/?{query}' search_url = url + 'input/?{query}'
# URL template for the input/json.jsp endpoint; the {query} and {token}
# placeholders are filled in with str.format() when a request is built
search_url = url + 'input/json.jsp'\
'?async=true'\
'&banners=raw'\
'&debuggingdata=false'\
'&format=image,plaintext,imagemap,minput,moutput'\
'&formattimeout=2'\
'&{query}'\
'&output=JSON'\
'&parsetimeout=2'\
'&proxycode={token}'\
'&scantimeout=0.5'\
'&sponsorcategories=true'\
'&statemethod=deploybutton'
# xpath variables # xpath variables
scripts_xpath = '//script' scripts_xpath = '//script'
title_xpath = '//title' title_xpath = '//title'
failure_xpath = '//p[attribute::class="pfail"]' failure_xpath = '//p[attribute::class="pfail"]'
# module-level cache for the proxy code:
#   'value'        - the code substituted into the {token} placeholder of search_url
#   'last_updated' - timestamp recorded when the code was fetched (None until then)
token = {'value': '',
'last_updated': None}
# it seems wolframalpha resets its token every hour
def obtain_token():
    """Fetch a fresh proxy code and store it in the module-level ``token``.

    On success ``token['value']`` is replaced with the ``code`` field of the
    JSON reply and ``token['last_updated']`` is set to the start of the
    current hour (the current time rounded down to a multiple of 3600 s).

    On failure (request error, invalid JSON, reply without a ``code`` key)
    the previously cached value is kept, so a transient outage neither
    breaks module import nor a search request.

    Returns:
        The module-level ``token`` dict, updated in place.
    """
    # read the clock once so the hour boundary comes from a single instant
    now = time()
    update_time = now - (now % 3600)
    try:
        token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
        token['value'] = loads(token_response.text)['code']
    except (IOError, ValueError, KeyError, TypeError):
        # NOTE(review): assumes http_get raises requests-style exceptions,
        # which derive from IOError - confirm against searx.poolrequests.
        # ValueError covers invalid JSON; KeyError/TypeError cover a reply
        # that is not an object with a 'code' member.
        if token['last_updated'] is None:
            # keep the timestamp numeric so `time() - token['last_updated']`
            # in request() still works and triggers a retry on the next query
            token['last_updated'] = 0
        return token
    token['last_updated'] = update_time
    return token
# fetch an initial token as soon as the module is loaded
obtain_token()
# do search-request # do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'i': query})) # obtain token if last update was more than an hour
if time() - token['last_updated'] > 3600:
obtain_token()
params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
return params return params
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = [] resp_json = loads(resp.text)
line = None
dom = html.fromstring(resp.text) if not resp_json['queryresult']['success']:
scripts = dom.xpath(scripts_xpath) return []
# the answer is inside a js function # TODO handle resp_json['queryresult']['assumptions']
# answer can be located in different 'pods', although by default it should be in pod_0200 result_chunks = []
possible_locations = ['pod_0200\.push\((.*)', for pod in resp_json['queryresult']['pods']:
'pod_0100\.push\((.*)'] pod_title = pod.get('title', '')
if 'subpods' not in pod:
# failed result
if dom.xpath(failure_xpath):
return results
# get line that matches the pattern
for pattern in possible_locations:
for script in scripts:
try:
line = search(pattern, script.text_content()).group(1)
break
except AttributeError:
continue continue
if line: for subpod in pod['subpods']:
break if 'img' in subpod:
result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
.format(escape(pod_title or subpod['img']['alt']),
escape(subpod['img']['src']),
escape(subpod['img']['alt'])))
if line: if not result_chunks:
# extract answer from json return []
answer = line[line.find('{'):line.rfind('}') + 1]
try:
answer = loads(answer)
except Exception:
answer = loads(answer.encode('unicode-escape'))
answer = answer['stringified']
# clean plaintext answer return [{'url': resp.request.headers['Referer'],
h = HTMLParser.HTMLParser() 'title': 'Wolframalpha',
answer = h.unescape(answer.decode('unicode-escape')) 'content': ''.join(result_chunks)}]
answer = sub(r'\\', '', answer)
results.append({'answer': answer})
# user input is in first part of title
title = dom.xpath(title_xpath)[0].text.encode('utf-8')
result_url = request(title[:-16], {})['url']
# append result
results.append({'url': result_url,
'title': title.decode('utf-8')})
return results

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from collections import defaultdict from collections import defaultdict
import mock
from searx.engines import wolframalpha_noapi from searx.engines import wolframalpha_noapi
from searx.testing import SearxTestCase from searx.testing import SearxTestCase
@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
self.assertRaises(AttributeError, wolframalpha_noapi.response, []) self.assertRaises(AttributeError, wolframalpha_noapi.response, [])
self.assertRaises(AttributeError, wolframalpha_noapi.response, '') self.assertRaises(AttributeError, wolframalpha_noapi.response, '')
self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]') self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]')
# TODO
html = """
<!DOCTYPE html>
<title> Parangaricutirimícuaro - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<div id="closest">
<p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p>
<div id="dtips">
<div class="tip">
<span class="tip-title">Tip:&nbsp;</span>
Check your spelling, and use English
<span class="tip-extra"></span>
</div>
</div>
</div>
</body>
</html>
"""
# test failed query
response = mock.Mock(text=html)
self.assertEqual(wolframalpha_noapi.response(response), [])
html = """
<!DOCTYPE html>
<title> sqrt(-1) - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""});
} catch(e) { }
</script>
</body>
</html>
"""
# test plaintext
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertEquals('i', results[0]['answer'])
self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url'])
html = """
<!DOCTYPE html>
<title> integral 1/x - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
} catch(e) { }
</script>
</body>
</html>
"""
# test integral
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('log(x)+c', results[0]['answer'])
self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url'])
html = """
<!DOCTYPE html>
<title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
} catch(e) { }
</script>
</body>
</html>
"""
# test input in mathematical notation
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('log(x)+c', results[0]['answer'])
self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url'])
html = """
<!DOCTYPE html>
<title> 1 euro to yen - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1 (euro) to Japanese yen"});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5 (Japanese yen)"});
} catch(e) { }
</script>
</body>
</html>
"""
# test output with htmlentity
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('¥'.decode('utf-8'), results[0]['answer'])
self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url'])
html = """
<!DOCTYPE html>
<title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"});
} catch(e) { }
</script>
</body>
</html>
"""
# test output with utf-8 character
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer'])
self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches',
results[1]['url'])