Merge 023a646b0a24e0f37eaa308b793cc8eecfb1ba57 into ecee73eafd694bb91b840882aea5b3d6c5b40a7b
This commit is contained in:
commit
d1a4793fb5
9
docs/dev/plugins/rerank.rst
Normal file
9
docs/dev/plugins/rerank.rst
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
.. _rerank plugin:
|
||||||
|
|
||||||
|
================
|
||||||
|
Rerank
|
||||||
|
================
|
||||||
|
|
||||||
|
.. automodule:: searx.plugins.rerank
|
||||||
|
:members:
|
||||||
|
|
@ -68,6 +68,9 @@ class PluginInfo:
|
|||||||
keywords: list[str] = field(default_factory=list)
|
keywords: list[str] = field(default_factory=list)
|
||||||
"""See :py:obj:`Plugin.keywords`"""
|
"""See :py:obj:`Plugin.keywords`"""
|
||||||
|
|
||||||
|
is_allowed: bool = True
|
||||||
|
"""Switch to disable plugin completely, without the user preference."""
|
||||||
|
|
||||||
|
|
||||||
class Plugin(abc.ABC):
|
class Plugin(abc.ABC):
|
||||||
"""Abstract base class of all Plugins."""
|
"""Abstract base class of all Plugins."""
|
||||||
|
115
searx/plugins/rerank.py
Normal file
115
searx/plugins/rerank.py
Normal file
@ -0,0 +1,115 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# pylint: disable=missing-module-docstring, missing-class-docstring
|
||||||
|
from __future__ import annotations
|
||||||
|
import typing
|
||||||
|
|
||||||
|
from flask_babel import gettext
|
||||||
|
|
||||||
|
from searx import settings
|
||||||
|
from searx.plugins import Plugin, PluginInfo
|
||||||
|
from searx.result_types import EngineResults
|
||||||
|
|
||||||
|
if typing.TYPE_CHECKING:
|
||||||
|
from searx.search import SearchWithPlugins
|
||||||
|
from searx.extended_types import SXNG_Request
|
||||||
|
|
||||||
|
try:
|
||||||
|
import bm25s
|
||||||
|
except ImportError:
|
||||||
|
# Import error is ignored because the admin has to install bm25s manually to use the plugin
|
||||||
|
bm25s = None
|
||||||
|
|
||||||
|
|
||||||
|
class SXNGPlugin(Plugin):
    """Plugin which reranks the search results using the Okapi BM25 algorithm.

    This plugin utilizes the ``bm25s`` library to reorder search results based on their relevance to the search
    query, potentially improving the quality of results.  Before enabling this plugin, ensure you have installed
    the ``bm25s`` pip package, e.g. by installing it directly via pip or by adding it to the project's
    ``requirements.txt`` file.

    .. rubric:: Configuration

    To enable the Rerank plugin, add it to the ``enabled_plugins`` list in your ``settings.yml`` file:

    .. code:: yaml

       enabled_plugins:
         ..
         - 'Rerank plugin'

    By default, the plugin retains the information about which engines found a particular result.
    Results that appear in multiple engine results will receive a score boost.
    This approach might be relevant if you wish results found by different engines to be prioritized.
    You can modify this behaviour by configuring the ``remove_extra_engines`` setting.
    If ``remove_extra_engines`` is set to ``true``, the original engine list is reduced to only the first engine.
    This is useful when you prefer the reranking to not be affected by any potential overlap
    of results from different engines.

    .. code:: yaml

       rerank:
         remove_extra_engines: true

    """

    id = "rerank"
    default_on = False

    def __init__(self):
        super().__init__()

        # Languages passed to bm25s as the stopword selection; any other locale falls back to 'en'.
        self.stopword_langs = ['en', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ru', 'sv', 'no', 'zh']

        # ``rerank:`` may be missing or present-but-empty (YAML parses the latter as None), so
        # normalise to a dict before reading the option.  The value stays None when unset.
        rerank_settings = settings.get('rerank') or {}
        self.remove_extra_engines = rerank_settings.get('remove_extra_engines')

        self.info = PluginInfo(
            id=self.id,
            name=gettext("Rerank plugin"),
            description=gettext("""Rerank search results, ignoring original engine ranking"""),
            preference_section="general",
            # Without the optional bm25s dependency the plugin is flagged as not allowed.
            is_allowed=bm25s is not None,
        )

    def post_search(self, request: "SXNG_Request", search: "SearchWithPlugins") -> EngineResults:
        """Rewrite the ``positions`` of the merged results according to a BM25 ranking.

        Every merged result is indexed as ``"<title> | <content> | <url>"`` and scored against the
        search query.  The rank obtained this way replaces the engine positions stored in each
        result that carries a ``positions`` entry:

        - with ``remove_extra_engines`` enabled, ``positions`` becomes a single entry and
          ``engines`` is reduced to the result's own ``engine``;
        - otherwise every existing position is overwritten with the new rank, so the number of
          positions (one per reporting engine) is preserved.

        :param request: The current request (not used by this plugin).
        :param search: The search whose result container is reranked in place.
        :return: An empty :py:obj:`EngineResults` when ``bm25s`` is not installed, otherwise the
            (mutated) merged result list of the container.
        """
        if not bm25s:
            return EngineResults()

        # pylint: disable=protected-access
        results = search.result_container._merged_results
        # NOTE(review): the merged results themselves are returned to the caller (as in the
        # original code) -- confirm the plugin storage does not add them to the container again.

        # Nothing to rank: avoid handing an empty corpus (and k=0) to bm25s.
        if not results:
            return results

        query = search.search_query.query
        locale = search.search_query.locale

        # Determine the stopwords based on the selected locale
        stopwords = locale.language if locale and locale.language in self.stopword_langs else 'en'

        corpus = [
            f"{result.get('title', '')} | {result.get('content', '')} | {result.get('url', '')}" for result in results
        ]

        retriever = bm25s.BM25()
        retriever.index(bm25s.tokenize(corpus, stopwords=stopwords))

        query_tokens = bm25s.tokenize(query, stopwords=stopwords)

        # Retrieve ranked indices of results based on the query tokens
        indices = retriever.retrieve(query_tokens, k=len(results), return_as='documents', show_progress=False)

        # Only one query was submitted, hence only the first row of the retrieval output is used.
        for position, index in enumerate(indices[0], start=1):
            result = results[index]
            if 'positions' not in result:
                continue

            if self.remove_extra_engines:
                # Only keep the main engine and set our ranking
                result['positions'] = [position]
                result['engines'] = {result['engine']}
            else:
                # Overwrite all engine positions with the new ranking
                # Results returned from multiple engines will still get a score boost
                result['positions'] = [position] * len(result['positions'])

        return results
|
@ -249,6 +249,7 @@ outgoing:
|
|||||||
# - 'Hostnames plugin' # see 'hostnames' configuration below
|
# - 'Hostnames plugin' # see 'hostnames' configuration below
|
||||||
# - 'Open Access DOI rewrite'
|
# - 'Open Access DOI rewrite'
|
||||||
# - 'Tor check plugin'
|
# - 'Tor check plugin'
|
||||||
|
# - 'Rerank plugin' # requires the bm25s python dependency to be installed
|
||||||
|
|
||||||
# Configuration of the "Hostnames plugin":
|
# Configuration of the "Hostnames plugin":
|
||||||
#
|
#
|
||||||
|
@ -38,7 +38,7 @@
|
|||||||
|
|
||||||
{%- macro plugin_preferences(section) -%}
|
{%- macro plugin_preferences(section) -%}
|
||||||
{%- for plugin in plugins_storage -%}
|
{%- for plugin in plugins_storage -%}
|
||||||
{%- if plugin.preference_section == section -%}
|
{%- if plugin.preference_section == section and plugin.is_allowed -%}
|
||||||
<fieldset>{{- '' -}}
|
<fieldset>{{- '' -}}
|
||||||
<legend>{{ _(plugin.name) }}</legend>{{- '' -}}
|
<legend>{{ _(plugin.name) }}</legend>{{- '' -}}
|
||||||
<div class="value">
|
<div class="value">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user