[mod] utils.py: add markdown_to_text helper function

This commit is contained in:
Bnyro 2023-09-08 08:40:22 +02:00 committed by Markus Heiser
parent 668b1d55ab
commit a3d7e9c285
2 changed files with 30 additions and 12 deletions

View file

@ -42,10 +42,9 @@ Implementations
from datetime import datetime from datetime import datetime
from urllib.parse import urlencode from urllib.parse import urlencode
from markdown_it import MarkdownIt
from flask_babel import gettext from flask_babel import gettext
from searx.utils import html_to_text from searx.utils import markdown_to_text
about = { about = {
"website": 'https://lemmy.ml/', "website": 'https://lemmy.ml/',
@ -78,11 +77,6 @@ def request(query, params):
return params return params
def _format_content(content):
    """Render the Markdown ``content`` to HTML and return its text.

    The Markdown is rendered by a ``MarkdownIt`` parser using the
    ``commonmark`` preset with the ``typographer`` option switched on and the
    ``replacements`` and ``smartquotes`` rules enabled; the resulting HTML is
    then handed to ``html_to_text``.
    """
    renderer = MarkdownIt("commonmark", {"typographer": True})
    # enable() switches the rules on in place, so the call needs no chaining
    renderer.enable(["replacements", "smartquotes"])
    rendered = renderer.render(content)
    return html_to_text(rendered)
def _get_communities(json): def _get_communities(json):
results = [] results = []
@ -97,7 +91,7 @@ def _get_communities(json):
{ {
'url': result['community']['actor_id'], 'url': result['community']['actor_id'],
'title': result['community']['title'], 'title': result['community']['title'],
'content': _format_content(result['community'].get('description', '')), 'content': markdown_to_text(result['community'].get('description', '')),
'img_src': result['community'].get('icon', result['community'].get('banner')), 'img_src': result['community'].get('icon', result['community'].get('banner')),
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata, 'metadata': metadata,
@ -114,7 +108,7 @@ def _get_users(json):
{ {
'url': result['person']['actor_id'], 'url': result['person']['actor_id'],
'title': result['person']['name'], 'title': result['person']['name'],
'content': _format_content(result['person'].get('bio', '')), 'content': markdown_to_text(result['person'].get('bio', '')),
} }
) )
@ -140,7 +134,7 @@ def _get_posts(json):
content = result['post'].get('body', '').strip() content = result['post'].get('body', '').strip()
if content: if content:
content = _format_content(content) content = markdown_to_text(content)
results.append( results.append(
{ {
@ -164,7 +158,7 @@ def _get_comments(json):
content = result['comment'].get('content', '').strip() content = result['comment'].get('content', '').strip()
if content: if content:
content = _format_content(content) content = markdown_to_text(content)
metadata = ( metadata = (
f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
@ -176,7 +170,7 @@ def _get_comments(json):
{ {
'url': result['comment']['ap_id'], 'url': result['comment']['ap_id'],
'title': result['post']['name'], 'title': result['post']['name'],
'content': _format_content(result['comment']['content']), 'content': markdown_to_text(result['comment']['content']),
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata, 'metadata': metadata,
} }

View file

@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice from random import choice
from html.parser import HTMLParser from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse
from markdown_it import MarkdownIt
from lxml import html from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
return s.get_text() return s.get_text()
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    The Markdown is first rendered to HTML by a ``MarkdownIt`` parser
    (``commonmark`` preset, ``typographer`` option on, ``replacements`` and
    ``smartquotes`` rules enabled); the HTML is then passed through
    :py:func:`html_to_text`.

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    parser = MarkdownIt("commonmark", {"typographer": True})
    # enable() switches the rules on in place, so the call needs no chaining
    parser.enable(["replacements", "smartquotes"])
    rendered_html = parser.render(markdown_str)
    return html_to_text(rendered_html)
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]: def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
"""Extract text from a lxml result """Extract text from a lxml result