diff --git a/docs/dev/plugins/rerank.rst b/docs/dev/plugins/rerank.rst
new file mode 100644
index 000000000..7611747e0
--- /dev/null
+++ b/docs/dev/plugins/rerank.rst
@@ -0,0 +1,9 @@
+.. _rerank plugin:
+
+================
+Rerank
+================
+
+.. automodule:: searx.plugins.rerank
+  :members:
+
diff --git a/searx/plugins/_core.py b/searx/plugins/_core.py
index 7df9772e9..6f302f1e2 100644
--- a/searx/plugins/_core.py
+++ b/searx/plugins/_core.py
@@ -68,6 +68,9 @@ class PluginInfo:
     keywords: list[str] = field(default_factory=list)
     """See :py:obj:`Plugin.keywords`"""
 
+    is_allowed: bool = True
+    """Switch to disable plugin completely, without the user preference."""
+
 
 class Plugin(abc.ABC):
     """Abstract base class of all Plugins."""
diff --git a/searx/plugins/rerank.py b/searx/plugins/rerank.py
new file mode 100644
index 000000000..3ec858e0a
--- /dev/null
+++ b/searx/plugins/rerank.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring, missing-class-docstring
+from __future__ import annotations
+import typing
+
+from flask_babel import gettext
+
+from searx import settings
+from searx.plugins import Plugin, PluginInfo
+from searx.result_types import EngineResults
+
+if typing.TYPE_CHECKING:
+    from searx.search import SearchWithPlugins
+    from searx.extended_types import SXNG_Request
+
+try:
+    import bm25s
+except ImportError:
+    # Import error is ignored because the admin has to install bm25s manually to use the plugin
+    bm25s = None
+
+
+class SXNGPlugin(Plugin):
+    """Plugin which reranks the search results using the Okapi BM25 algorithm.
+
+    This plugin utilizes the `bm25s` library to reorder search results based on their relevance to the search query,
+    potentially improving the quality of results. Before enabling this plugin,
+    ensure you have installed the ``bm25s`` pip package, e.g. by installing it directly via pip or
+    by adding it to the project's `requirements.txt` file.
+
+    Configuration:
+    --------------
+    To enable the Rerank plugin, add it to the `enabled_plugins` list in your `settings.yml` file:
+
+    .. code:: yaml
+
+      enabled_plugins:
+        ..
+        - 'Rerank plugin'
+
+    By default, the plugin retains the information about which engines found a particular result.
+    Results that appear in multiple engine results will receive a score boost.
+    This approach might be relevant if you wish results found by different engines to be prioritized.
+    You can modify this behaviour by configuring the ``remove_extra_engines`` setting.
+    If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine.
+    This is useful when you prefer the reranking to not be affected by any potential overlap
+    of results from different engines.
+
+    .. code:: yaml
+
+      rerank:
+        remove_extra_engines: true
+
+    """
+
+    id = "rerank"
+    default_on = False
+
+    def __init__(self):
+        super().__init__()
+
+        self.stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']
+        # ``rerank:`` may be absent or left empty (``None``) in settings.yml
+        rerank_settings = settings.get('rerank') or {}
+        self.remove_extra_engines = rerank_settings.get('remove_extra_engines')
+
+        self.info = PluginInfo(
+            id=self.id,
+            name=gettext("Rerank plugin"),
+            description=gettext("""Rerank search results, ignoring original engine ranking"""),
+            preference_section="general",
+            is_allowed=bm25s is not None,
+        )
+
+    def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
+        """Rewrite the ``positions`` of the merged results in BM25 relevance order.
+
+        Each merged result is scored against the search query using its title,
+        content and URL; its ``positions`` list is then overwritten with the new
+        rank.  With ``remove_extra_engines`` enabled, ``engines`` is also reduced
+        to the result's own ``engine``.
+
+        Returns an empty :py:obj:`EngineResults` when ``bm25s`` is not installed,
+        otherwise the (mutated in place) list of merged results.
+        """
+        results = EngineResults()
+
+        if bm25s is None:
+            return results
+
+        # pylint: disable=protected-access
+        # NOTE(review): this hands back the container's own merged list; confirm
+        # that the caller does not add the returned items to the container again.
+        results = search.result_container._merged_results
+        if not results:
+            # Nothing to rank: skip building an index over an empty corpus.
+            return results
+
+        query = search.search_query.query
+        locale = search.search_query.locale
+
+        # Determine the stopwords based on the selected locale
+        stopwords = locale.language if locale and locale.language in self.stopword_langs else 'en'
+
+        retriever = bm25s.BM25()
+        result_tokens = bm25s.tokenize(
+            [
+                f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}"
+                for result in results
+            ],
+            stopwords=stopwords,
+        )
+        retriever.index(result_tokens)
+
+        query_tokens = bm25s.tokenize(query, stopwords=stopwords)
+
+        # Retrieve ranked indices of results based on the query tokens
+        indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)
+
+        # Only one query string is tokenized, so only the first row is used
+        for position, index in enumerate(indices[0], start=1):
+            result = results[index]
+            if 'positions' not in result:
+                continue
+            if self.remove_extra_engines:
+                # Only keep the main engine and set our ranking
+                result['positions'] = [position]
+                result['engines'] = {result['engine']}
+            else:
+                # Overwrite all engine positions with the new ranking
+                # Results returned from multiple engines will still get a score boost
+                result['positions'] = [position] * len(result['positions'])
+
+        return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 3c9245bbd..83e9b67cc 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -249,6 +249,7 @@ outgoing:
 #   - 'Hostnames plugin'  # see 'hostnames' configuration below
 #   - 'Open Access DOI rewrite'
 #   - 'Tor check plugin'
+#   - 'Rerank plugin'  # requires the bm25s python dependency to be installed
 
 # Configuration of the "Hostnames plugin":
 #
diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html
index e86e926cc..e815102c3 100644
--- a/searx/templates/simple/preferences.html
+++ b/searx/templates/simple/preferences.html
@@ -38,7 +38,7 @@
 
 {%- macro plugin_preferences(section) -%}
   {%- for plugin in plugins_storage -%}
-    {%- if plugin.preference_section == section -%}
+    {%- if plugin.preference_section == section and plugin.is_allowed -%}
{{- '' -}} {{ _(plugin.name) }}{{- '' -}}