forked from Ponysearch/Ponysearch
[fix] fix flickr_noapi decoding (#1655)
Non-ASCII characters were decoded incorrectly. Add a helper function, searx.utils.ecma_unescape (a Python implementation of the JavaScript unescape function).
This commit is contained in:
parent
4dc792e1e2
commit
2179079a91
3 changed files with 32 additions and 6 deletions
|
@ -16,7 +16,8 @@ from json import loads
|
||||||
from time import time
|
from time import time
|
||||||
import re
|
import re
|
||||||
from searx.engines import logger
|
from searx.engines import logger
|
||||||
from searx.url_utils import urlencode, unquote
|
from searx.url_utils import urlencode
|
||||||
|
from searx.utils import ecma_unescape, html_to_text
|
||||||
|
|
||||||
logger = logger.getChild('flickr-noapi')
|
logger = logger.getChild('flickr-noapi')
|
||||||
|
|
||||||
|
@ -75,11 +76,10 @@ def response(resp):
|
||||||
|
|
||||||
for index in legend:
|
for index in legend:
|
||||||
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
|
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
|
||||||
author = unquote(photo.get('realname', ''))
|
author = ecma_unescape(photo.get('realname', ''))
|
||||||
source = unquote(photo.get('username', '')) + ' @ Flickr'
|
source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
|
||||||
title = unquote(photo.get('title', ''))
|
title = ecma_unescape(photo.get('title', ''))
|
||||||
content = unquote(photo.get('description', ''))
|
content = html_to_text(ecma_unescape(photo.get('description', '')))
|
||||||
|
|
||||||
img_src = None
|
img_src = None
|
||||||
# From the biggest to the lowest format
|
# From the biggest to the lowest format
|
||||||
for image_size in image_sizes:
|
for image_size in image_sizes:
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
import csv
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
import hmac
|
import hmac
|
||||||
|
@ -44,6 +45,9 @@ logger = logger.getChild('utils')
|
||||||
blocked_tags = ('script',
|
blocked_tags = ('script',
|
||||||
'style')
|
'style')
|
||||||
|
|
||||||
|
ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
|
||||||
|
ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
|
||||||
|
|
||||||
useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
|
useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
|
||||||
+ "/data/useragents.json", 'r', encoding='utf-8').read())
|
+ "/data/useragents.json", 'r', encoding='utf-8').read())
|
||||||
|
|
||||||
|
@ -415,3 +419,18 @@ def to_string(obj):
|
||||||
return obj.__str__()
|
return obj.__str__()
|
||||||
if hasattr(obj, '__repr__'):
|
if hasattr(obj, '__repr__'):
|
||||||
return obj.__repr__()
|
return obj.__repr__()
|
||||||
|
|
||||||
|
|
||||||
|
def ecma_unescape(s):
    """
    Python implementation of the JavaScript ``unescape`` function.

    ``%uXXXX`` is replaced by the character with the hexadecimal code XXXX
    and ``%XX`` by the character with the hexadecimal code XX (hex digits
    are case-insensitive). Everything else, including a ``%`` that does not
    start a valid escape, is copied unchanged.

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/unescape

    :param s: text escaped with the JavaScript ``escape`` function
    :returns: the unescaped text
    """
    # Python 2 needs unichr() for code points above 255; Python 3 only has chr().
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def decode(match):
        # group(1) holds the 4 hex digits of "%uXXXX", group(2) the 2 of "%XX"
        return to_char(int(match.group(1) or match.group(2), 16))

    # Decode in ONE left-to-right pass, as the ECMAScript algorithm does.
    # Running the "%uXXXX" and "%XX" substitutions one after the other would
    # re-scan already decoded text: "%u002541" must give "%41", not "A".
    # "%u5409" becomes "吉", "%20" becomes " ", "%F3" becomes "ó"
    return re.sub(r'%u([0-9a-fA-F]{4})|%([0-9a-fA-F]{2})', decode, s)
|
||||||
|
|
|
@ -90,6 +90,13 @@ class TestUtils(SearxTestCase):
|
||||||
self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
|
self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
|
||||||
self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
|
self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
|
||||||
|
|
||||||
|
def test_ecma_unscape(self):
    # Each pair is (escaped input, expected result of utils.ecma_unescape).
    expectations = (
        # "%XX" escape of an ASCII character
        ('text%20with%20space', 'text with space'),
        # "%XX" escape of a Latin-1 character; "%xx" is not an escape and is kept
        ('text using %xx: %F3', u'text using %xx: ó'),
        # "%uXXXX" escapes, with upper and lower case hex digits
        ('text using %u: %u5409, %u4E16%u754c', u'text using %u: 吉, 世界'),
    )
    for escaped, expected in expectations:
        self.assertEqual(utils.ecma_unescape(escaped), expected)
|
||||||
|
|
||||||
|
|
||||||
class TestHTMLTextExtractor(SearxTestCase):
|
class TestHTMLTextExtractor(SearxTestCase):
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue