[mod] drop fasttext-predict (#5795)

Removes the `fasttext-predict` dependency and the language detection code.

If a user selects `auto` for the search language, the detected language now
falls back directly to the `Accept-Language` header sent by the browser (which was already the fallback when fasttext returned no result).

- fasttext's [language detection is unreliable](https://github.com/searxng/searxng/issues/4195) for some languages, especially short search queries, and in particular for queries containing proper names which is a common case.
- `fasttext-predict` consumes [significant memory](https://github.com/searxng/searxng/pull/1969#issuecomment-1345366676) without offering users much real value.
- the upstream fasttext project was archived by Meta in 2024
- users already have two better alternatives: the `Accept-Language` header and the search-syntax language prefix (e.g. `:fr` or `:de`).

Related: https://github.com/searxng/searxng/issues/4195
Closes: https://github.com/searxng/searxng/issues/5790
This commit is contained in:
Brock Vojkovic
2026-03-06 22:40:44 +08:00
committed by GitHub
parent c7ba2158f9
commit 68ff08f224
5 changed files with 5 additions and 127 deletions

View File

@@ -12,7 +12,6 @@ httpx-socks[asyncio]==0.10.0
sniffio==1.3.1
valkey==6.1.1
markdown-it-py==3.0.0
fasttext-predict==0.9.2.4
tomli==2.4.0; python_version < '3.11'
msgspec==0.20.0
typer==0.24.1

View File

@@ -25,16 +25,11 @@ from lxml.etree import XPath, XPathError, XPathSyntaxError
from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage]
from searx import settings
from searx.data import USER_AGENTS, data_dir, gsa_useragents_loader
from searx.data import USER_AGENTS, gsa_useragents_loader
from searx.version import VERSION_TAG
from searx.sxng_locales import sxng_locales
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger
if t.TYPE_CHECKING:
import fasttext.FastText # type: ignore
logger = logger.getChild('utils')
XPathSpecType: t.TypeAlias = str | XPath
@@ -61,12 +56,6 @@ _JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
_XPATH_CACHE: dict[str, XPath] = {}
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None # pyright: ignore[reportPrivateUsage]
"""fasttext model to predict language of a search term"""
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@@ -610,17 +599,6 @@ def eval_xpath_getindex(
return default
def _get_fasttext_model() -> "fasttext.FastText._FastText": # pyright: ignore[reportPrivateUsage]
global _FASTTEXT_MODEL # pylint: disable=global-statement
if _FASTTEXT_MODEL is None:
import fasttext # pylint: disable=import-outside-toplevel
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
fasttext.FastText.eprint = lambda x: None # type: ignore
_FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) # type: ignore
return _FASTTEXT_MODEL
def get_embeded_stream_url(url: str):
"""
Converts a standard video URL into its embed format. Supported services include Youtube,
@@ -683,77 +661,6 @@ def get_embeded_stream_url(url: str):
return iframe_src
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
"""Detect the language of the ``text`` parameter.
:param str text: The string whose language is to be detected.
:param float threshold: Threshold filters the returned labels by a threshold
on probability. A choice of 0.3 will return labels with at least 0.3
probability.
:param bool only_search_languages: If ``True``, returns only supported
SearXNG search languages. see :py:obj:`searx.languages`
:rtype: str, None
:returns:
The detected language code or ``None``. See below.
:raises ValueError: If ``text`` is not a string.
The language detection is done by using `a fork`_ of the fastText_ library
(`python fasttext`_). fastText_ distributes the `language identification
model`_, for reference:
- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_
The `language identification model`_ support the language codes
(ISO-639-3)::
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
By using ``only_search_languages=True`` the `language identification model`_
is harmonized with the SearXNG's language (locale) model. General
conditions of SearXNG's locale model are:
a. SearXNG's locale of a query is passed to the
:py:obj:`searx.locales.get_engine_locale` to get a language and/or region
code that is used by an engine.
b. Most of SearXNG's engines do not support all the languages from `language
identification model`_ and there is also a discrepancy in the ISO-639-3
(fasttext) and ISO-639-2 (SearXNG)handling. Further more, in SearXNG the
locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
(``zh_Hans``) while the `language identification model`_ reduce both to
``zh``.
.. _a fork: https://github.com/searxng/fasttext-predict
.. _fastText: https://fasttext.cc/
.. _python fasttext: https://pypi.org/project/fasttext/
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
"""
if not isinstance(text, str):
raise ValueError('text must a str') # pyright: ignore[reportUnreachable]
r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) # type: ignore
if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: # type: ignore
language = r[0][0].split('__label__')[1] # type: ignore
if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
return None
return language # type: ignore
return None
def _j2p_process_escape(match: re.Match[str]) -> str:
# deal with ECMA escape characters
_escape = match.group(1) or match.group(2)

View File

@@ -9,7 +9,6 @@ from searx.query import RawTextQuery
from searx.engines import categories, engines
from searx.search.models import SearchQuery, EngineRef
from searx.preferences import Preferences, is_locked
from searx.utils import detect_language
# remove duplicate queries.
@@ -233,9 +232,7 @@ def get_search_query_from_webapp(
4. string with the *selected locale* of the query
About language/locale: if the client selects the alias ``auto`` the
``SearchQuery`` object is build up by the :py:obj:`detected language
<searx.utils.detect_language>`. If language recognition does not have a
match the language preferred by the :py:obj:`Preferences.client` is used.
language preferred by the :py:obj:`Preferences.client` is used.
If client does not have a preference, the default ``all`` is used.
The *selected locale* in the tuple always represents the selected
@@ -267,8 +264,7 @@ def get_search_query_from_webapp(
selected_locale = query_lang
if query_lang == 'auto':
query_lang = detect_language(query, threshold=0.8, only_search_languages=True)
query_lang = query_lang or preferences.client.locale_tag or 'all'
query_lang = preferences.client.locale_tag or 'all'
if not is_locked('categories') and raw_text_query.specific:
# if engines are calculated from query,

View File

@@ -19,7 +19,7 @@ from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, searxng_useragent
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
from searx.utils import gen_useragent
import searx.search
import searx.network
from searx.data import data_dir
@@ -169,7 +169,7 @@ def get_website_description(url, lang1, lang2=None):
lang = extract_text(html.xpath('/html/@lang'))
if lang is None and len(lang1) > 0:
lang = lang1
lang = detect_language(description) or lang or 'en'
lang = lang or 'en'
lang = lang.split('_')[0]
lang = lang.split('-')[0]
return (lang, description)

View File

@@ -194,27 +194,3 @@ class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
with self.assertRaises(SearxEngineXPathException) as context:
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
self.assertEqual(context.exception.message, 'the result is not a list')
def test_detect_language(self):
# make sure new line are not an issue
# fasttext.predict('') does not accept new line.
l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
self.assertEqual(l, 'en')
l = utils.detect_language(
'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
)
self.assertEqual(l, 'ja')
l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
self.assertEqual(l, 'tr')
l = utils.detect_language('')
self.assertIsNone(l)
# mix languages --> None
l = utils.detect_language('The いろはにほへと Pijamalı')
self.assertIsNone(l)
with self.assertRaises(ValueError):
utils.detect_language(None) # type: ignore