[fix] wolframalpha page changes

related issues: #508 #509
2016-02-17 17:07:19 +01:00 · 2016-02-17 17:07:19 +01:00 · d06178139f
commit d06178139f
parent 4e5af8d87b
2 changed files with 59 additions and 223 deletions
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@ -8,79 +8,85 @@
 # @stable      no
 # @parse       answer
-from re import search, sub
+from cgi import escape
 from json import loads
 from time import time
 from urllib import urlencode
-from lxml import html
+
-import HTMLParser
+from searx.poolrequests import get as http_get
 # search-url
-url = 'http://www.wolframalpha.com/'
+url = 'https://www.wolframalpha.com/'
 search_url = url + 'input/?{query}'
 search_url = url + 'input/json.jsp'\
    '?async=true'\
    '&banners=raw'\
    '&debuggingdata=false'\
    '&format=image,plaintext,imagemap,minput,moutput'\
    '&formattimeout=2'\
    '&{query}'\
    '&output=JSON'\
    '&parsetimeout=2'\
    '&proxycode={token}'\
    '&scantimeout=0.5'\
    '&sponsorcategories=true'\
    '&statemethod=deploybutton'
 # xpath variables
 scripts_xpath = '//script'
 title_xpath = '//title'
 failure_xpath = '//p[attribute::class="pfail"]'
 token = {'value': '',
         'last_updated': None}
 # it seems that wolframalpha resets its token every hour
 def obtain_token():
    """Fetch a new token from wolframalpha and store it in `token`.

    The module-level `token` dict is updated in place: 'value' receives the
    'code' field of the JSON response and 'last_updated' receives the current
    time rounded down to a multiple of 3600 seconds. The same dict is returned.
    """
    # current timestamp rounded down to the hour boundary
    update_time = time() - (time() % 3600)
    token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
    # the response body is parsed as JSON; the token is read from its 'code' key
    token['value'] = loads(token_response.text)['code']
    token['last_updated'] = update_time
    return token
 obtain_token()
 # do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'i': query}))
+    # obtain a new token if the last update was more than an hour ago
    if time() - token['last_updated'] > 3600:
        obtain_token()
    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
    return params
 # get response from search-request
 def response(resp):
-    results = []
+    resp_json = loads(resp.text)
    line = None
-    dom = html.fromstring(resp.text)
+    if not resp_json['queryresult']['success']:
-    scripts = dom.xpath(scripts_xpath)
+        return []
-    # the answer is inside a js function
+    # TODO handle resp_json['queryresult']['assumptions']
-    # answer can be located in different 'pods', although by default it should be in pod_0200
+    result_chunks = []
-    possible_locations = ['pod_0200\.push\((.*)',
+    for pod in resp_json['queryresult']['pods']:
-                          'pod_0100\.push\((.*)']
+        pod_title = pod.get('title', '')
-
+        if 'subpods' not in pod:
    # failed result
    if dom.xpath(failure_xpath):
        return results
    # get line that matches the pattern
    for pattern in possible_locations:
        for script in scripts:
            try:
                line = search(pattern, script.text_content()).group(1)
                break
            except AttributeError:
            continue
-        if line:
+        for subpod in pod['subpods']:
-            break
+            if 'img' in subpod:
                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
                                     .format(escape(pod_title or subpod['img']['alt']),
                                             escape(subpod['img']['src']),
                                             escape(subpod['img']['alt'])))
-    if line:
+    if not result_chunks:
-        # extract answer from json
+        return []
        answer = line[line.find('{'):line.rfind('}') + 1]
        try:
            answer = loads(answer)
        except Exception:
            answer = loads(answer.encode('unicode-escape'))
        answer = answer['stringified']
-        # clean plaintext answer
+    return [{'url': resp.request.headers['Referer'],
-        h = HTMLParser.HTMLParser()
+             'title': 'Wolframalpha',
-        answer = h.unescape(answer.decode('unicode-escape'))
+             'content': ''.join(result_chunks)}]
        answer = sub(r'\\', '', answer)
        results.append({'answer': answer})
    # user input is in first part of title
    title = dom.xpath(title_xpath)[0].text.encode('utf-8')
    result_url = request(title[:-16], {})['url']
    # append result
    results.append({'url': result_url,
                    'title': title.decode('utf-8')})
    return results
--- a/tests/unit/engines/test_wolframalpha_noapi.py
+++ b/tests/unit/engines/test_wolframalpha_noapi.py
@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from collections import defaultdict
 import mock
 from searx.engines import wolframalpha_noapi
 from searx.testing import SearxTestCase
@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
        self.assertRaises(AttributeError, wolframalpha_noapi.response, [])
        self.assertRaises(AttributeError, wolframalpha_noapi.response, '')
        self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]')
-
+        # TODO
        html = """
        <!DOCTYPE html>
            <title> Parangaricutirimícuaro - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <div id="closest">
                    <p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p>
                    <div id="dtips">
                        <div class="tip">
                            <span class="tip-title">Tip:&nbsp;</span>
                                Check your spelling, and use English
                            <span class="tip-extra"></span>
                        </div>
                    </div>
                </div>
            </body>
        </html>
        """
        # test failed query
        response = mock.Mock(text=html)
        self.assertEqual(wolframalpha_noapi.response(response), [])
        html = """
        <!DOCTYPE html>
            <title> sqrt(-1) - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <script type="text/javascript">
                  try {
                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
                      context.jsonArray.popups.pod_0100 = [];
                    }
                    context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""});
                  } catch(e) { }
                  try {
                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
                      context.jsonArray.popups.pod_0200 = [];
                    }
                    context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""});
                  } catch(e) { }
                </script>
            </body>
        </html>
        """
        # test plaintext
        response = mock.Mock(text=html)
        results = wolframalpha_noapi.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEquals('i', results[0]['answer'])
        self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title'])
        self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url'])
        html = """
        <!DOCTYPE html>
            <title> integral 1/x - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <script type="text/javascript">
                  try {
                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
                      context.jsonArray.popups.pod_0100 = [];
                    }
                    context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
                  } catch(e) { }
                </script>
            </body>
        </html>
        """
        # test integral
        response = mock.Mock(text=html)
        results = wolframalpha_noapi.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertIn('log(x)+c', results[0]['answer'])
        self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title'])
        self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url'])
        html = """
        <!DOCTYPE html>
            <title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <script type="text/javascript">
                  try {
                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
                      context.jsonArray.popups.pod_0100 = [];
                    }
                    context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
                  } catch(e) { }
                </script>
            </body>
        </html>
        """
        # test input in mathematical notation
        response = mock.Mock(text=html)
        results = wolframalpha_noapi.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertIn('log(x)+c', results[0]['answer'])
        self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title'])
        self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url'])
        html = """
        <!DOCTYPE html>
            <title> 1 euro to yen - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <script type="text/javascript">
                  try {
                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
                      context.jsonArray.popups.pod_0100 = [];
                    }
                  context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1  (euro) to Japanese yen"});
                  } catch(e) { }
                  try {
                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
                      context.jsonArray.popups.pod_0200 = [];
                    }
                    context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5  (Japanese yen)"});
                  } catch(e) { }
                </script>
            </body>
        </html>
        """
        # test output with htmlentity
        response = mock.Mock(text=html)
        results = wolframalpha_noapi.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertIn('¥'.decode('utf-8'), results[0]['answer'])
        self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title'])
        self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url'])
        html = """
        <!DOCTYPE html>
            <title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title>
            <meta charset="utf-8" />
            <body>
                <script type="text/javascript">
                  try {
                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
                      context.jsonArray.popups.pod_0100 = [];
                    }
 [...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"});
                  } catch(e) { }
                  try {
                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
                      context.jsonArray.popups.pod_0200 = [];
                    }
 pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"});
                  } catch(e) { }
                </script>
            </body>
        </html>
        """
        # test output with utf-8 character
        response = mock.Mock(text=html)
        results = wolframalpha_noapi.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer'])
        self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title'])
        self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches',
                          results[1]['url'])