[feat] engines: add reuters news engine
This commit is contained in:
parent
5daa4f0460
commit
9ffa9fb730
8
docs/dev/engines/online/reuters.rst
Normal file
8
docs/dev/engines/online/reuters.rst
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
.. _reuters engine:
|
||||||
|
|
||||||
|
=======
|
||||||
|
Reuters
|
||||||
|
=======
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.reuters
|
||||||
|
:members:
|
90
searx/engines/reuters.py
Normal file
90
searx/engines/reuters.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Reuters_ (news) is an international news agency.
|
||||||
|
|
||||||
|
.. _Reuters: https://www.reuters.com
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
The engine has the following additional settings:
|
||||||
|
|
||||||
|
- :py:obj:`sort_order`
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: reuters
|
||||||
|
engine: reuters
|
||||||
|
shortcut: reu
|
||||||
|
sort_order: "relevance"
|
||||||
|
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from json import dumps
|
||||||
|
from urllib.parse import quote_plus
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from searx.result_types import EngineResults
|
||||||
|
|
||||||
|
# Engine metadata shown on SearXNG's preferences/about pages.
about = {
    "website": "https://www.reuters.com",
    "wikidata_id": "Q130879",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

categories = ["news"]
time_range_support = True
paging = True

base_url = "https://www.reuters.com"

# Number of results requested per page (also used to compute the offset).
results_per_page = 20
sort_order = "relevance"
"""Sort order, one of ``relevance``, ``display_date:desc`` or ``display_date:asc``."""

# Maps SearXNG's time-range names to a look-back window in days.
time_range_duration_map = {
    "day": 1,
    "week": 7,
    "month": 30,
    "year": 365,
}
|
||||||
|
|
||||||
|
|
||||||
|
def request(query, params):
    """Build the request URL for Reuters' internal article-search API.

    The API endpoint expects a single ``query`` URL parameter whose value is
    a JSON-encoded object carrying the actual search arguments (keyword,
    offset, sort order, page size, site).
    """
    args = {
        "keyword": query,
        # API paging is offset-based; SearXNG's pageno is 1-based.
        "offset": (params["pageno"] - 1) * results_per_page,
        "orderby": sort_order,
        "size": results_per_page,
        "website": "reuters",
    }

    # robustness: don't KeyError when "time_range" is absent; the truthiness
    # check also skips None / "" values.
    if params.get("time_range"):
        time_diff_days = time_range_duration_map[params["time_range"]]
        # NOTE(review): naive local time is used here — presumably the API
        # treats start_date loosely; confirm whether UTC is expected.
        start_date = datetime.now() - timedelta(days=time_diff_days)
        args["start_date"] = start_date.isoformat()

    params["url"] = f"{base_url}/pf/api/v3/content/fetch/articles-by-search-v2?query={quote_plus(dumps(args))}"
    return params
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp) -> EngineResults:
    """Parse the JSON article list into SearXNG main results."""
    res = EngineResults()

    for result in resp.json().get("result", {}).get("articles", []):
        res.add(
            res.types.MainResult(
                url=base_url + result["canonical_url"],
                title=result["web"],
                # robustness: some articles come without a description
                content=result.get("description", ""),
                thumbnail=result.get("thumbnail", {}).get("url", ""),
                # NOTE(review): this yields None when "kicker" is missing,
                # while MainResult.metadata defaults to "" — confirm None is
                # accepted downstream.
                metadata=result.get("kicker", {}).get("name"),
                # display_time is an ISO-8601 UTC timestamp with a "Z" suffix
                publishedDate=datetime.strptime(result["display_time"], "%Y-%m-%dT%H:%M:%SZ"),
            )
        )
    return res
|
@ -25,6 +25,8 @@ import re
|
|||||||
import urllib.parse
|
import urllib.parse
|
||||||
import warnings
|
import warnings
|
||||||
import typing
|
import typing
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
|
|
||||||
@ -212,6 +214,15 @@ def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result |
|
|||||||
result.normalize_result_fields()
|
result.normalize_result_fields()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_date_fields(result: MainResult | LegacyResult):
    """Derive ``pubdate`` (string form) from ``publishedDate`` in place.

    Dates that :py:obj:`datetime.datetime.strftime` cannot format (years
    before 1900 on some platforms) are discarded by resetting
    ``publishedDate`` to ``None``.
    """
    # nothing to do for an empty string or a None value
    if not result.publishedDate:
        return
    try:
        result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z')
    except ValueError:
        # strftime rejected the date (datetime module limitation) — drop it
        result.publishedDate = None
|
||||||
|
|
||||||
|
|
||||||
class Result(msgspec.Struct, kw_only=True):
|
class Result(msgspec.Struct, kw_only=True):
|
||||||
"""Base class of all result types :ref:`result types`."""
|
"""Base class of all result types :ref:`result types`."""
|
||||||
|
|
||||||
@ -347,6 +358,24 @@ class MainResult(Result): # pylint: disable=missing-class-docstring
|
|||||||
thumbnail: str = ""
|
thumbnail: str = ""
|
||||||
"""URL of a thumbnail that is displayed in the result item."""
|
"""URL of a thumbnail that is displayed in the result item."""
|
||||||
|
|
||||||
|
publishedDate: datetime.datetime | None = None
|
||||||
|
"""The date on which the object was published."""
|
||||||
|
|
||||||
|
pubdate: str = ""
|
||||||
|
"""String representation of :py:obj:`MainResult.publishedDate`"""
|
||||||
|
|
||||||
|
length: time.struct_time | None = None
|
||||||
|
"""Playing duration in seconds."""
|
||||||
|
|
||||||
|
views: str = ""
|
||||||
|
"""View count in humanized number format."""
|
||||||
|
|
||||||
|
author: str = ""
|
||||||
|
"""Author of the title."""
|
||||||
|
|
||||||
|
metadata: str = ""
|
||||||
|
"""Miscellaneous metadata."""
|
||||||
|
|
||||||
priority: typing.Literal["", "high", "low"] = ""
|
priority: typing.Literal["", "high", "low"] = ""
|
||||||
"""The priority can be set via :ref:`hostnames plugin`, for example."""
|
"""The priority can be set via :ref:`hostnames plugin`, for example."""
|
||||||
|
|
||||||
@ -379,8 +408,8 @@ class MainResult(Result): # pylint: disable=missing-class-docstring
|
|||||||
|
|
||||||
def normalize_result_fields(self):
|
def normalize_result_fields(self):
|
||||||
super().normalize_result_fields()
|
super().normalize_result_fields()
|
||||||
|
|
||||||
_normalize_text_fields(self)
|
_normalize_text_fields(self)
|
||||||
|
_normalize_date_fields(self)
|
||||||
if self.engine:
|
if self.engine:
|
||||||
self.engines.add(self.engine)
|
self.engines.add(self.engine)
|
||||||
|
|
||||||
@ -419,6 +448,8 @@ class LegacyResult(dict):
|
|||||||
positions: list[int]
|
positions: list[int]
|
||||||
score: float
|
score: float
|
||||||
category: str
|
category: str
|
||||||
|
publishedDate: datetime.datetime | None = None
|
||||||
|
pubdate: str = ""
|
||||||
|
|
||||||
# infobox result
|
# infobox result
|
||||||
urls: list[dict[str, str]]
|
urls: list[dict[str, str]]
|
||||||
@ -514,6 +545,7 @@ class LegacyResult(dict):
|
|||||||
return f"LegacyResult: {super().__repr__()}"
|
return f"LegacyResult: {super().__repr__()}"
|
||||||
|
|
||||||
def normalize_result_fields(self):
|
def normalize_result_fields(self):
|
||||||
|
_normalize_date_fields(self)
|
||||||
_normalize_url_fields(self)
|
_normalize_url_fields(self)
|
||||||
_normalize_text_fields(self)
|
_normalize_text_fields(self)
|
||||||
if self.engine:
|
if self.engine:
|
||||||
|
@ -1741,6 +1741,12 @@ engines:
|
|||||||
page_size: 25
|
page_size: 25
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
- name: reuters
|
||||||
|
engine: reuters
|
||||||
|
shortcut: reu
|
||||||
|
# https://docs.searxng.org/dev/engines/online/reuters.html
|
||||||
|
# sort_order: "relevance"
|
||||||
|
|
||||||
- name: right dao
|
- name: right dao
|
||||||
engine: xpath
|
engine: xpath
|
||||||
paging: true
|
paging: true
|
||||||
|
@ -694,14 +694,6 @@ def search():
|
|||||||
if 'title' in result and result['title']:
|
if 'title' in result and result['title']:
|
||||||
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
|
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
|
||||||
|
|
||||||
if getattr(result, 'publishedDate', None): # do not try to get a date from an empty string or a None type
|
|
||||||
try: # test if publishedDate >= 1900 (datetime module bug)
|
|
||||||
result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
|
|
||||||
except ValueError:
|
|
||||||
result['publishedDate'] = None
|
|
||||||
else:
|
|
||||||
result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])
|
|
||||||
|
|
||||||
# set result['open_group'] = True when the template changes from the previous result
|
# set result['open_group'] = True when the template changes from the previous result
|
||||||
# set result['close_group'] = True when the template changes on the next result
|
# set result['close_group'] = True when the template changes on the next result
|
||||||
if current_template != result.template:
|
if current_template != result.template:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user