[fix] wolframalpha page changes

related issues: #508 #509
This commit is contained in:
Adam Tauber 2016-02-17 17:07:19 +01:00
parent 4e5af8d87b
commit d06178139f
2 changed files with 59 additions and 223 deletions

View file

@ -8,79 +8,85 @@
# @stable no
# @parse answer
from re import search, sub
from cgi import escape
from json import loads
from time import time
from urllib import urlencode
from lxml import html
import HTMLParser
from searx.poolrequests import get as http_get
# search-url
# Base URL of the WolframAlpha site.
url = 'https://www.wolframalpha.com/'

# Template for the JSON endpoint.  `{query}` takes the url-encoded input
# parameter and `{token}` the proxy code kept in `token['value']`.
search_url = (url + 'input/json.jsp'
              '?async=true'
              '&banners=raw'
              '&debuggingdata=false'
              '&format=image,plaintext,imagemap,minput,moutput'
              '&formattimeout=2'
              '&{query}'
              '&output=JSON'
              '&parsetimeout=2'
              '&proxycode={token}'
              '&scantimeout=0.5'
              '&sponsorcategories=true'
              '&statemethod=deploybutton')

# xpath variables
# Expressions selecting <script> elements, the <title> element and the
# paragraph carrying the "pfail" class.
scripts_xpath = '//script'
title_xpath = '//title'
failure_xpath = '//p[attribute::class="pfail"]'

# Token state shared by obtain_token() and request():
#   value        - proxy code substituted into `search_url`
#   last_updated - timestamp written by obtain_token(); None until then
token = {'value': '',
         'last_updated': None}
# WolframAlpha appears to reset its token every hour.
def obtain_token():
    """Fetch a fresh proxy code and store it in the module-level ``token``.

    Returns:
        The module-level ``token`` dict, with ``value`` set to the ``code``
        field of the JSON reply and ``last_updated`` set to the current time
        rounded down to a multiple of 3600 seconds.

    Exceptions raised by the HTTP request or by decoding the reply are not
    caught here; in that case ``token`` is left unchanged.
    """
    # Read the clock once so the rounding cannot straddle an hour boundary
    # between two separate time() calls.
    now = time()
    update_time = now - (now % 3600)
    token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
    token['value'] = loads(token_response.text)['code']
    token['last_updated'] = update_time
    return token
# Fetch a first token when this module is loaded; note that this performs
# obtain_token()'s HTTP request at import time.
obtain_token()
# do search-request
def request(query, params):
    """Fill ``params`` with the URL and Referer header for a query.

    Args:
        query: the search terms.
        params: request parameters.  ``params['url']`` is set and
            ``params['headers']['Referer']`` is added, so
            ``params['headers']`` must already exist.

    Returns:
        The same ``params`` object.
    """
    # obtain a new token if the last update was more than an hour ago
    if time() - token['last_updated'] > 3600:
        obtain_token()

    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])

    # response() reuses this header as the result link, so the query has to
    # be url-encoded rather than concatenated raw.
    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?' + urlencode({'i': query})

    return params
# get response from search-request
def response(resp):
    """Turn WolframAlpha's JSON reply into at most one result.

    Args:
        resp: response object; ``resp.text`` is decoded as JSON and
            ``resp.request.headers['Referer']`` is used as the result URL.

    Returns:
        An empty list when ``queryresult.success`` is falsy or when no subpod
        carries an image; otherwise a one-element list holding a dict with
        ``url``, ``title`` and ``content`` (one HTML paragraph per image).
    """
    resp_json = loads(resp.text)

    if not resp_json['queryresult']['success']:
        return []

    # TODO handle resp_json['queryresult']['assumptions']
    result_chunks = []
    for pod in resp_json['queryresult']['pods']:
        pod_title = pod.get('title', '')
        if 'subpods' not in pod:
            continue
        for subpod in pod['subpods']:
            if 'img' not in subpod:
                continue
            img = subpod['img']
            # src and alt end up inside double-quoted attributes, so quotes
            # have to be escaped as well (quote=True).
            result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
                                 .format(escape(pod_title or img['alt']),
                                         escape(img['src'], quote=True),
                                         escape(img['alt'], quote=True)))

    if not result_chunks:
        return []

    return [{'url': resp.request.headers['Referer'],
             'title': 'Wolframalpha',
             'content': ''.join(result_chunks)}]

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import mock
from searx.engines import wolframalpha_noapi
from searx.testing import SearxTestCase
@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
self.assertRaises(AttributeError, wolframalpha_noapi.response, [])
self.assertRaises(AttributeError, wolframalpha_noapi.response, '')
self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]')
html = """
<!DOCTYPE html>
<title> Parangaricutirimícuaro - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<div id="closest">
<p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p>
<div id="dtips">
<div class="tip">
<span class="tip-title">Tip:&nbsp;</span>
Check your spelling, and use English
<span class="tip-extra"></span>
</div>
</div>
</div>
</body>
</html>
"""
# test failed query
response = mock.Mock(text=html)
self.assertEqual(wolframalpha_noapi.response(response), [])
html = """
<!DOCTYPE html>
<title> sqrt(-1) - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""});
} catch(e) { }
</script>
</body>
</html>
"""
# test plaintext
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertEquals('i', results[0]['answer'])
self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url'])
html = """
<!DOCTYPE html>
<title> integral 1/x - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
} catch(e) { }
</script>
</body>
</html>
"""
# test integral
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('log(x)+c', results[0]['answer'])
self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url'])
html = """
<!DOCTYPE html>
<title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
} catch(e) { }
</script>
</body>
</html>
"""
# test input in mathematical notation
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('log(x)+c', results[0]['answer'])
self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url'])
html = """
<!DOCTYPE html>
<title> 1 euro to yen - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1 (euro) to Japanese yen"});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5 (Japanese yen)"});
} catch(e) { }
</script>
</body>
</html>
"""
# test output with htmlentity
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('¥'.decode('utf-8'), results[0]['answer'])
self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url'])
html = """
<!DOCTYPE html>
<title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title>
<meta charset="utf-8" />
<body>
<script type="text/javascript">
try {
if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
context.jsonArray.popups.pod_0100 = [];
}
[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"});
} catch(e) { }
try {
if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
context.jsonArray.popups.pod_0200 = [];
}
pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"});
} catch(e) { }
</script>
</body>
</html>
"""
# test output with utf-8 character
response = mock.Mock(text=html)
results = wolframalpha_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer'])
self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title'])
self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches',
results[1]['url'])
# TODO