From 9f9ae5427d82c6b1dbd0145a398d756ceaaf0d4f Mon Sep 17 00:00:00 2001 From: GenericMale Date: Sun, 19 Jan 2025 01:18:13 +0100 Subject: [PATCH 1/3] [feat] plugins: new rerank results plugin --- searx/plugins/rerank.py | 77 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 searx/plugins/rerank.py diff --git a/searx/plugins/rerank.py b/searx/plugins/rerank.py new file mode 100644 index 000000000..50397bcbe --- /dev/null +++ b/searx/plugins/rerank.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Plugin which reranks the search results using the Okapi BM25 algorithm. +Before enabling the Rerank plugin, you must the install the pip package ``bm25s``. + +Enable in ``settings.yml``: + +.. code:: yaml + + enabled_plugins: + .. + - 'Rerank plugin' + +By default, the engine list is retained, so results found by multiple engines receive a score boost. +The following setting can be used to ensure that the engine list only contains the first engine. +This will prevent overlapping search engine results from affecting the ranking: + +.. code:: yaml + + rerank: + remove_extra_engines: true + +""" + +from searx import settings + +try: + import bm25s +except ImportError: + # Import error is ignored because the admin has to install bm25s manually to use the engine + pass + +name = 'Rerank plugin' +description = 'Rerank search results, ignoring original engine ranking' +default_on = False +preference_section = 'general' + +# Supported stopwords for bm25s. 
Default is 'en' +stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh'] + +remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines') + + +def post_search(_request, search): + # pylint: disable=protected-access + results = search.result_container._merged_results + query = search.search_query.query + locale = search.search_query.locale + + # Determine the stopwords based on the selected locale + stopwords = locale.language if locale and locale.language in stopword_langs else True + + retriever = bm25s.BM25() + result_tokens = bm25s.tokenize( + [f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}" for result in results], + stopwords=stopwords, + ) + retriever.index(result_tokens) + + query_tokens = bm25s.tokenize(query, stopwords=stopwords) + + # Retrieve ranked indices of results based on the query tokens + indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False) + + if remove_extra_engines: + # Only keep the main engine and set our ranking + for position, index in enumerate(indices[0]): + if 'positions' in results[index]: + results[index]['positions'] = [position + 1] + results[index]['engines'] = set([results[index]['engine']]) + else: + # Overwrite all engine positions with the new ranking + # Results returned from multiple engines will still get a score boost + for position, index in enumerate(indices[0]): + if 'positions' in results[index]: + results[index]['positions'] = [position + 1] * len(results[index]['positions']) + + return True From 7fa0fbd93d000d62a38f253dd390b5c7cd62ecad Mon Sep 17 00:00:00 2001 From: GenericMale Date: Sun, 26 Jan 2025 23:38:16 +0100 Subject: [PATCH 2/3] [mod] rerank plugin: hide if bm25s is not installed & add to docs --- docs/src/searx.plugins.rerank.rst | 9 +++++++++ searx/plugins/rerank.py | 33 +++++++++++++++++++++++-------- searx/settings.yml | 1 + 3 files changed, 35 insertions(+), 8 deletions(-) create 
mode 100644 docs/src/searx.plugins.rerank.rst diff --git a/docs/src/searx.plugins.rerank.rst b/docs/src/searx.plugins.rerank.rst new file mode 100644 index 000000000..fd0b49169 --- /dev/null +++ b/docs/src/searx.plugins.rerank.rst @@ -0,0 +1,9 @@ +.. _rerank plugin: + +================ +Rerank plugin +================ + +.. automodule:: searx.plugins.rerank + :members: + diff --git a/searx/plugins/rerank.py b/searx/plugins/rerank.py index 50397bcbe..1b52b6a8a 100644 --- a/searx/plugins/rerank.py +++ b/searx/plugins/rerank.py @@ -1,8 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Plugin which reranks the search results using the Okapi BM25 algorithm. -Before enabling the Rerank plugin, you must the install the pip package ``bm25s``. -Enable in ``settings.yml``: +This plugin utilizes the `bm25s` library to reorder search results based on their relevance to the search query, +potentially improving the quality of results. Before enabling this plugin, +ensure you have installed the ``bm25s`` pip package. e.g. by installing it directly via pip or +by adding it to the project's `requirements.txt` file. + +Configuration: +-------------- +To enable the Rerank plugin, add it to the `enabled_plugins` list in your `settings.yml` file: .. code:: yaml @@ -10,9 +16,13 @@ Enable in ``settings.yml``: .. - 'Rerank plugin' -By default, the engine list is retained, so results found by multiple engines receive a score boost. -The following setting can be used to ensure that the engine list only contains the first engine. -This will prevent overlapping search engine results from affecting the ranking: +By default, the plugin retains the information about which engines found a particular result. +Results that appear in multiple engine results will receive a score boost. +This approach might be relevant if you wish results found by different engines to be prioritized. +You can modify this behaviour by configuring the ``remove_extra_engines`` setting. 
+If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine. +This is useful when you prefer the reranking to not be affected by any potential overlap +of results from different engines. .. code:: yaml @@ -26,8 +36,8 @@ from searx import settings try: import bm25s except ImportError: - # Import error is ignored because the admin has to install bm25s manually to use the engine - pass + # Import error is ignored because the admin has to install bm25s manually to use the plugin + bm25s = None name = 'Rerank plugin' description = 'Rerank search results, ignoring original engine ranking' @@ -41,13 +51,16 @@ remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines') def post_search(_request, search): + if not bm25s: + return True + # pylint: disable=protected-access results = search.result_container._merged_results query = search.search_query.query locale = search.search_query.locale # Determine the stopwords based on the selected locale - stopwords = locale.language if locale and locale.language in stopword_langs else True + stopwords = locale.language if locale and locale.language in stopword_langs else 'en' retriever = bm25s.BM25() result_tokens = bm25s.tokenize( @@ -75,3 +88,7 @@ def post_search(_request, search): results[index]['positions'] = [position + 1] * len(results[index]['positions']) return True + + +def is_allowed(): + return bm25s is not None diff --git a/searx/settings.yml b/searx/settings.yml index 45dfc67bc..56331b367 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -249,6 +249,7 @@ outgoing: # - 'Hostnames plugin' # see 'hostnames' configuration below # - 'Open Access DOI rewrite' # - 'Tor check plugin' +# - 'Rerank plugin' # requires the bm25s python dependency to be installed # Configuration of the "Hostnames plugin": # From ab83de9332207dfb737d7fb7b305250e1078b457 Mon Sep 17 00:00:00 2001 From: GenericMale Date: Wed, 29 Jan 2025 18:17:46 +0100 Subject: [PATCH 3/3] [fix] 
rerank plugin: adapt to #4183 --- .../plugins/rerank.rst} | 2 +- searx/plugins/_core.py | 3 + searx/plugins/rerank.py | 161 ++++++++++-------- searx/templates/simple/preferences.html | 2 +- 4 files changed, 96 insertions(+), 72 deletions(-) rename docs/{src/searx.plugins.rerank.rst => dev/plugins/rerank.rst} (88%) diff --git a/docs/src/searx.plugins.rerank.rst b/docs/dev/plugins/rerank.rst similarity index 88% rename from docs/src/searx.plugins.rerank.rst rename to docs/dev/plugins/rerank.rst index fd0b49169..7611747e0 100644 --- a/docs/src/searx.plugins.rerank.rst +++ b/docs/dev/plugins/rerank.rst @@ -1,7 +1,7 @@ .. _rerank plugin: ================ -Rerank plugin +Rerank ================ .. automodule:: searx.plugins.rerank diff --git a/searx/plugins/_core.py b/searx/plugins/_core.py index 70e5758ec..aac330158 100644 --- a/searx/plugins/_core.py +++ b/searx/plugins/_core.py @@ -68,6 +68,9 @@ class PluginInfo: keywords: list[str] = field(default_factory=list) """See :py:obj:`Plugin.keywords`""" + is_allowed: bool = True + """Switch to disable the plugin completely, regardless of the user preference.""" + class Plugin(abc.ABC): """Abstract base class of all Plugins.""" diff --git a/searx/plugins/rerank.py b/searx/plugins/rerank.py index 1b52b6a8a..3ec858e0a 100644 --- a/searx/plugins/rerank.py +++ b/searx/plugins/rerank.py @@ -1,37 +1,17 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""Plugin which reranks the search results using the Okapi BM25 algorithm. +# pylint: disable=missing-module-docstring, missing-class-docstring +from __future__ import annotations +import typing -This plugin utilizes the `bm25s` library to reorder search results based on their relevance to the search query, -potentially improving the quality of results. Before enabling this plugin, -ensure you have installed the ``bm25s`` pip package. e.g. by installing it directly via pip or -by adding it to the project's `requirements.txt` file. 
- -Configuration: --------------- -To enable the Rerank plugin, add it to the `enabled_plugins` list in your `settings.yml` file: - -.. code:: yaml - - enabled_plugins: - .. - - 'Rerank plugin' - -By default, the plugin retains the information about which engines found a particular result. -Results that appear in multiple engine results will receive a score boost. -This approach might be relevant if you wish results found by different engines to be prioritized. -You can modify this behaviour by configuring the ``remove_extra_engines`` setting. -If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine. -This is useful when you prefer the reranking to not be affected by any potential overlap -of results from different engines. - -.. code:: yaml - - rerank: - remove_extra_engines: true - -""" +from flask_babel import gettext from searx import settings +from searx.plugins import Plugin, PluginInfo +from searx.result_types import EngineResults + +if typing.TYPE_CHECKING: + from searx.search import SearchWithPlugins + from searx.extended_types import SXNG_Request try: import bm25s @@ -39,56 +19,97 @@ except ImportError: # Import error is ignored because the admin has to install bm25s manually to use the plugin bm25s = None -name = 'Rerank plugin' -description = 'Rerank search results, ignoring original engine ranking' -default_on = False -preference_section = 'general' -# Supported stopwords for bm25s. Default is 'en' -stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh'] +class SXNGPlugin(Plugin): + """Plugin which reranks the search results using the Okapi BM25 algorithm. -remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines') + This plugin utilizes the `bm25s` library to reorder search results based on their relevance to the search query, + potentially improving the quality of results. 
Before enabling this plugin, + ensure you have installed the ``bm25s`` pip package, e.g. by installing it directly via pip or + by adding it to the project's ``requirements.txt`` file. + + Configuration: + -------------- + To enable the Rerank plugin, add it to the ``enabled_plugins`` list in your ``settings.yml`` file: -def post_search(_request, search): - if not bm25s: - return True + .. code:: yaml - # pylint: disable=protected-access - results = search.result_container._merged_results - query = search.search_query.query - locale = search.search_query.locale + enabled_plugins: + .. + - 'Rerank plugin' - # Determine the stopwords based on the selected locale - stopwords = locale.language if locale and locale.language in stopword_langs else 'en' + By default, the plugin retains the information about which engines found a particular result. + Results that appear in multiple engine results will receive a score boost. + This approach might be relevant if you wish results found by different engines to be prioritized. + You can modify this behaviour by configuring the ``remove_extra_engines`` setting. + If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine. + This is useful when you prefer the reranking to not be affected by any potential overlap + of results from different engines. - retriever = bm25s.BM25() - result_tokens = bm25s.tokenize( - [f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}" for result in results], - stopwords=stopwords, - ) - retriever.index(result_tokens) + .. 
code:: yaml - query_tokens = bm25s.tokenize(query, stopwords=stopwords) + rerank: + remove_extra_engines: true - # Retrieve ranked indices of results based on the query tokens - indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False) + """ - if remove_extra_engines: - # Only keep the main engine and set our ranking - for position, index in enumerate(indices[0]): - if 'positions' in results[index]: - results[index]['positions'] = [position + 1] - results[index]['engines'] = set([results[index]['engine']]) - else: - # Overwrite all engine positions with the new ranking - # Results returned from multiple engines will still get a score boost - for position, index in enumerate(indices[0]): - if 'positions' in results[index]: - results[index]['positions'] = [position + 1] * len(results[index]['positions']) + id = "rerank" + default_on = False - return True + def __init__(self): + super().__init__() + self.stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh'] + self.remove_extra_engines = settings.get('rerank', {}).get('remove_extra_engines') -def is_allowed(): - return bm25s is not None + self.info = PluginInfo( + id=self.id, + name=gettext("Rerank plugin"), + description=gettext("""Rerank search results, ignoring original engine ranking"""), + preference_section="general", + is_allowed=bm25s is not None, + ) + + def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults: + results = EngineResults() + + if not bm25s: + return results + + # pylint: disable=protected-access + results = search.result_container._merged_results + query = search.search_query.query + locale = search.search_query.locale + + # Determine the stopwords based on the selected locale + stopwords = locale.language if locale and locale.language in self.stopword_langs else 'en' + + retriever = bm25s.BM25() + result_tokens = bm25s.tokenize( + [ + f"{result.get('title', '')} | 
{result.get('content', '')} | {result.get('url', '')}" + for result in results + ], + stopwords=stopwords, + ) + retriever.index(result_tokens) + + query_tokens = bm25s.tokenize(query, stopwords=stopwords) + + # Retrieve ranked indices of results based on the query tokens + indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False) + + if self.remove_extra_engines: + # Only keep the main engine and set our ranking + for position, index in enumerate(indices[0]): + if 'positions' in results[index]: + results[index]['positions'] = [position + 1] + results[index]['engines'] = set([results[index]['engine']]) + else: + # Overwrite all engine positions with the new ranking + # Results returned from multiple engines will still get a score boost + for position, index in enumerate(indices[0]): + if 'positions' in results[index]: + results[index]['positions'] = [position + 1] * len(results[index]['positions']) + + return results diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html index d68e90e4a..009165b86 100644 --- a/searx/templates/simple/preferences.html +++ b/searx/templates/simple/preferences.html @@ -38,7 +38,7 @@ {%- macro plugin_preferences(section) -%} {%- for plugin in plugins_storage -%} - {%- if plugin.preference_section == section -%} + {%- if plugin.preference_section == section and plugin.is_allowed -%}
{{- '' -}} {{ _(plugin.name) }}{{- '' -}}