Merge pull request #2019 from ArtikusHG/fasttext

Replace langdetect with fasttext (followup of #1969)
This commit is contained in:
Alexandre Flament 2022-12-16 21:54:07 +01:00 committed by GitHub
commit b927482195
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 61 additions and 54 deletions

View file

@ -11,7 +11,6 @@ httpx[http2]==0.21.2
Brotli==1.0.9
uvloop==0.17.0
httpx-socks[asyncio]==0.7.2
langdetect==1.0.9
setproctitle==1.3.2
redis==4.4.0
markdown-it-py==2.1.0

View file

@ -66,46 +66,28 @@ that is identified as an English term (try ``:de-DE thermomix``, for example).
"""
from flask_babel import gettext
import fasttext
import babel
from searx.data import data_dir
from searx.utils import detect_language
from searx.languages import language_codes
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a
# model.
fasttext.FastText.eprint = lambda x: None
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
default_on = False
lang_model: fasttext.FastText._FastText = None
"""fasttext model to predict language of a search term"""
supported_langs = set()
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
def get_model():
    """Return the fasttext language-identification model, loading it lazily.

    The model is cached in the module-level ``lang_model`` so the file is
    read from disk at most once; loading lazily keeps start-up memory low.
    """
    global lang_model  # pylint: disable=global-statement
    if lang_model is not None:
        return lang_model
    lang_model = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return lang_model
def pre_search(request, search):  # pylint: disable=unused-argument
    """Detect the language of the search query and switch the search to it.

    Uses :py:obj:`searx.utils.detect_language` (``min_probability=0`` so any
    fasttext prediction above its threshold is accepted).  If the detected
    language is one of :py:obj:`supported_langs`, the query's ``lang`` is
    replaced and, when babel knows the code, its ``locale`` as well.

    Always returns ``True`` so the search continues.
    """
    # NOTE: the diff residue duplicating the old inline-fasttext path was
    # removed — only the post-commit detect_language() path is kept.
    lang = detect_language(search.search_query.query, min_probability=0)
    if lang in supported_langs:
        search.search_query.lang = lang
        try:
            search.search_query.locale = babel.Locale.parse(lang)
        except babel.core.UnknownLocaleError:
            # language is supported by the engines but unknown to babel:
            # keep the lang code, leave the locale untouched
            pass
    return True

View file

@ -10,12 +10,10 @@ from timeit import default_timer
from urllib.parse import urlparse
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
import httpx
from searx import network, logger
from searx.utils import gen_useragent
from searx.utils import gen_useragent, detect_language
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@ -208,14 +206,10 @@ class ResultContainerTests:
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
def _add_language(self, text: str) -> typing.Optional[str]:
    """Detect the language of *text* and record it in the test results.

    The detected code (if any) is added to ``self.languages`` and reported
    via ``self.test_results.add_language``.  Always returns ``None``.
    """
    # diff residue duplicating the old langdetect try/except path was
    # removed — only the post-commit detect_language() path is kept.
    lang_str = detect_language(text)
    if lang_str:
        self.languages.add(lang_str)
        self.test_results.add_language(lang_str)
    return None
def _check_result(self, result):

View file

@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
import fasttext
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@ -22,7 +23,7 @@ from babel.core import get_global
from searx import settings
from searx.data import USER_AGENTS
from searx.data import USER_AGENTS, data_dir
from searx.version import VERSION_TAG
from searx.languages import language_codes
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
"""fasttext model to predict language of a search term"""
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
fasttext.FastText.eprint = lambda x: None
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
# to record xpath_spec
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
return default
def _get_fasttext_model() -> fasttext.FastText._FastText:
    """Return the shared fasttext model, loading it from disk on first call."""
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is not None:
        return _FASTTEXT_MODEL
    _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
    """Detect the language of *text* with fasttext's lid.176 model.

    https://fasttext.cc/docs/en/language-identification.html

    :param text: the text to classify.  Newlines are replaced by spaces
        before prediction, since ``fasttext.predict`` rejects them.
    :param threshold: fasttext prediction threshold (passed with ``k=1``).
    :param min_probability: minimum probability the top prediction must
        exceed, otherwise ``None`` is returned.
    :raises ValueError: if *text* is not a ``str``.
    :return: the detected language code (e.g. ``'en'``), or ``None`` when
        no prediction clears both thresholds.
    """
    if not isinstance(text, str):
        # fixed grammar of the error message ('text must a str')
        raise ValueError('text must be a str')
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    # predict() returns ([ '__label__xx', ...], [prob, ...]); strip the label prefix
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0 and r[1][0] > min_probability:
        return r[0][0].split('__label__')[1]
    return None

View file

@ -17,14 +17,11 @@ from os.path import join
from lxml.html import fromstring
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, match_language
from searx.locales import LOCALE_NAMES, locales_initialize
from searx import searx_dir
from searx.utils import gen_useragent
from searx.utils import gen_useragent, detect_language
import searx.search
import searx.network
@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
return None
def detect_language(text):
    """Return the language code langdetect predicts for *text*, or ``None``.

    Detection failures are swallowed; a prediction is only accepted when
    its probability exceeds 0.95.
    """
    try:
        candidates = detect_langs(str(text))  # pylint: disable=E1101
    except LangDetectException:
        return None
    if candidates and candidates[0].prob > 0.95:
        return candidates[0].lang
    return None
def get_website_description(url, lang1, lang2=None):
headers = {
'User-Agent': gen_useragent(),

View file

@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
self.assertEqual(context.exception.message, 'the result is not a list')
def test_detect_language(self):
    """Exercise utils.detect_language on clear, empty and mixed inputs."""
    # make sure new lines are not an issue:
    # fasttext.predict('') does not accept new lines.
    lang = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
    self.assertEqual(lang, 'en')
    lang = utils.detect_language('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす')
    self.assertEqual(lang, 'ja')
    lang = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
    self.assertEqual(lang, 'tr')
    # empty input yields no prediction
    lang = utils.detect_language('')
    self.assertIsNone(lang)
    # mix languages --> None (no single prediction clears the threshold)
    lang = utils.detect_language('The いろはにほへと Pijamalı')
    self.assertIsNone(lang)
    # non-str input is rejected
    with self.assertRaises(ValueError):
        utils.detect_language(None)