From 9ffa9fb73061f41f91a490bcb2772d2eb56e0d78 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Mon, 17 Mar 2025 21:53:15 +0100 Subject: [PATCH] [feat] engines: add reuters news engine --- docs/dev/engines/online/reuters.rst | 8 +++ searx/engines/reuters.py | 90 +++++++++++++++++++++++++++++ searx/result_types/_base.py | 34 ++++++++++- searx/settings.yml | 6 ++ searx/webapp.py | 8 --- 5 files changed, 137 insertions(+), 9 deletions(-) create mode 100644 docs/dev/engines/online/reuters.rst create mode 100644 searx/engines/reuters.py diff --git a/docs/dev/engines/online/reuters.rst b/docs/dev/engines/online/reuters.rst new file mode 100644 index 000000000..e0f685d17 --- /dev/null +++ b/docs/dev/engines/online/reuters.rst @@ -0,0 +1,8 @@ +.. _reuters engine: + +======= +Reuters +======= + +.. automodule:: searx.engines.reuters + :members: diff --git a/searx/engines/reuters.py b/searx/engines/reuters.py new file mode 100644 index 000000000..113124c48 --- /dev/null +++ b/searx/engines/reuters.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Reuters_ (news) is an international news agency. + +.. _Reuters: https://www.reuters.com + +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`sort_order` + +.. 
code:: yaml + + - name: reuters + engine: reuters + shortcut: reu + sort_order: "relevance" + + +Implementations +=============== + +""" + +from json import dumps +from urllib.parse import quote_plus +from datetime import datetime, timedelta + +from searx.result_types import EngineResults + +about = { + "website": "https://www.reuters.com", + "wikidata_id": "Q130879", + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "JSON", +} + +categories = ["news"] +time_range_support = True +paging = True + +base_url = "https://www.reuters.com" + +results_per_page = 20 +sort_order = "relevance" +"""Sort order, one of ``relevance``, ``display_date:desc`` or ``display_date:asc``.""" + +time_range_duration_map = { + "day": 1, + "week": 7, + "month": 30, + "year": 365, +} + + +def request(query, params): + args = { + "keyword": query, + "offset": (params["pageno"] - 1) * results_per_page, + "orderby": sort_order, + "size": results_per_page, + "website": "reuters", + } + if params["time_range"]: + time_diff_days = time_range_duration_map[params["time_range"]] + start_date = datetime.now() - timedelta(days=time_diff_days) + args["start_date"] = start_date.isoformat() + + params["url"] = f"{base_url}/pf/api/v3/content/fetch/articles-by-search-v2?query={quote_plus(dumps(args))}" + return params + + +def response(resp) -> EngineResults: + res = EngineResults() + + for result in resp.json().get("result", {}).get("articles", []): + res.add( + res.types.MainResult( + url=base_url + result["canonical_url"], + title=result["web"], + content=result["description"], + thumbnail=result.get("thumbnail", {}).get("url", ""), + metadata=result.get("kicker", {}).get("name"), + publishedDate=datetime.strptime(result["display_time"], "%Y-%m-%dT%H:%M:%SZ"), + ) + ) + return res diff --git a/searx/result_types/_base.py b/searx/result_types/_base.py index c4c0b18b2..caf7e2a4f 100644 --- a/searx/result_types/_base.py +++ 
b/searx/result_types/_base.py @@ -25,6 +25,8 @@ import re import urllib.parse import warnings import typing +import time +import datetime from collections.abc import Callable @@ -212,6 +214,15 @@ def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result | result.normalize_result_fields() +def _normalize_date_fields(result: MainResult | LegacyResult): + + if result.publishedDate: # do not try to get a date from an empty string or a None type + try: # test if publishedDate >= 1900 (datetime module bug) + result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z') + except ValueError: + result.publishedDate = None + + class Result(msgspec.Struct, kw_only=True): """Base class of all result types :ref:`result types`.""" @@ -347,6 +358,24 @@ class MainResult(Result): # pylint: disable=missing-class-docstring thumbnail: str = "" """URL of a thumbnail that is displayed in the result item.""" + publishedDate: datetime.datetime | None = None + """The date on which the object was published.""" + + pubdate: str = "" + """String representation of :py:obj:`MainResult.publishedDate`""" + + length: time.struct_time | None = None + """Playing duration in seconds.""" + + views: str = "" + """View count in humanized number format.""" + + author: str = "" + """Author of the title.""" + + metadata: str = "" + """Miscellaneous metadata.""" + priority: typing.Literal["", "high", "low"] = "" """The priority can be set via :ref:`hostnames plugin`, for example.""" @@ -379,8 +408,8 @@ class MainResult(Result): # pylint: disable=missing-class-docstring def normalize_result_fields(self): super().normalize_result_fields() - _normalize_text_fields(self) + _normalize_date_fields(self) if self.engine: self.engines.add(self.engine) @@ -419,6 +448,8 @@ class LegacyResult(dict): positions: list[int] score: float category: str + publishedDate: datetime.datetime | None = None + pubdate: str = "" # infobox result urls: list[dict[str, str]] @@ -514,6 +545,7 @@ class 
LegacyResult(dict): return f"LegacyResult: {super().__repr__()}" def normalize_result_fields(self): + _normalize_date_fields(self) _normalize_url_fields(self) _normalize_text_fields(self) if self.engine: diff --git a/searx/settings.yml b/searx/settings.yml index a08e2a625..d8d5f4170 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1741,6 +1741,12 @@ engines: page_size: 25 disabled: true + - name: reuters + engine: reuters + shortcut: reu + # https://docs.searxng.org/dev/engines/online/reuters.html + # sort_order: "relevance" + - name: right dao engine: xpath paging: true diff --git a/searx/webapp.py b/searx/webapp.py index 8d9fd9393..b721c7132 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -694,14 +694,6 @@ def search(): if 'title' in result and result['title']: result['title'] = highlight_content(escape(result['title'] or ''), search_query.query) - if getattr(result, 'publishedDate', None): # do not try to get a date from an empty string or a None type - try: # test if publishedDate >= 1900 (datetime module bug) - result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') - except ValueError: - result['publishedDate'] = None - else: - result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate']) - # set result['open_group'] = True when the template changes from the previous result # set result['close_group'] = True when the template changes on the next result if current_template != result.template: