forked from Ponysearch/Ponysearch
Merge pull request #2190 from dalf/fix-htmltextextractor
[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception
This commit is contained in:
commit
530fc4bda7
2 changed files with 18 additions and 2 deletions
|
@ -77,6 +77,10 @@ def highlight_content(content, query):
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTextExtractorException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class HTMLTextExtractor(HTMLParser):
|
class HTMLTextExtractor(HTMLParser):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
if tag != self.tags[-1]:
|
if tag != self.tags[-1]:
|
||||||
raise Exception("invalid html")
|
raise HTMLTextExtractorException()
|
||||||
|
|
||||||
self.tags.pop()
|
self.tags.pop()
|
||||||
|
|
||||||
|
@ -128,7 +132,10 @@ def html_to_text(html):
|
||||||
html = html.replace('\n', ' ')
|
html = html.replace('\n', ' ')
|
||||||
html = ' '.join(html.split())
|
html = ' '.join(html.split())
|
||||||
s = HTMLTextExtractor()
|
s = HTMLTextExtractor()
|
||||||
|
try:
|
||||||
s.feed(html)
|
s.feed(html)
|
||||||
|
except HTMLTextExtractorException:
|
||||||
|
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
|
||||||
return s.get_text()
|
return s.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
|
||||||
self.assertIsNotNone(utils.html_to_text(html))
|
self.assertIsNotNone(utils.html_to_text(html))
|
||||||
self.assertEqual(utils.html_to_text(html), "Test text")
|
self.assertEqual(utils.html_to_text(html), "Test text")
|
||||||
|
|
||||||
|
def test_html_to_text_invalid(self):
|
||||||
|
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||||
|
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
|
||||||
|
|
||||||
def test_prettify_url(self):
|
def test_prettify_url(self):
|
||||||
data = (('https://searx.me/', 'https://searx.me/'),
|
data = (('https://searx.me/', 'https://searx.me/'),
|
||||||
('https://searx.me/ű', 'https://searx.me/ű'),
|
('https://searx.me/ű', 'https://searx.me/ű'),
|
||||||
|
@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
|
||||||
self.html_text_extractor.handle_entityref(entity)
|
self.html_text_extractor.handle_entityref(entity)
|
||||||
self.assertIn(entity, self.html_text_extractor.result)
|
self.assertIn(entity, self.html_text_extractor.result)
|
||||||
|
|
||||||
|
def test_invalid_html(self):
|
||||||
|
text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
|
||||||
|
with self.assertRaises(utils.HTMLTextExtractorException):
|
||||||
|
self.html_text_extractor.feed(text)
|
||||||
|
|
||||||
|
|
||||||
class TestUnicodeWriter(SearxTestCase):
|
class TestUnicodeWriter(SearxTestCase):
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue