[feat] engines: add reuters news engine
This commit is contained in:
parent
5daa4f0460
commit
9ffa9fb730
8
docs/dev/engines/online/reuters.rst
Normal file
8
docs/dev/engines/online/reuters.rst
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
.. _reuters engine:
|
||||||
|
|
||||||
|
=======
|
||||||
|
Reuters
|
||||||
|
=======
|
||||||
|
|
||||||
|
.. automodule:: searx.engines.reuters
|
||||||
|
:members:
|
90
searx/engines/reuters.py
Normal file
90
searx/engines/reuters.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Reuters_ (news) is an international news agency.
|
||||||
|
|
||||||
|
.. _Reuters: https://www.reuters.com
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
=============
|
||||||
|
|
||||||
|
The engine has the following additional settings:
|
||||||
|
|
||||||
|
- :py:obj:`sort_order`
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: reuters
|
||||||
|
engine: reuters
|
||||||
|
shortcut: reu
|
||||||
|
sort_order: "relevance"
|
||||||
|
|
||||||
|
|
||||||
|
Implementations
|
||||||
|
===============
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from json import dumps
|
||||||
|
from urllib.parse import quote_plus
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from searx.result_types import EngineResults
|
||||||
|
|
||||||
|
# Engine metadata shown on SearXNG's preferences/about pages.
about = {
    "website": "https://www.reuters.com",
    "wikidata_id": "Q130879",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

categories = ["news"]
time_range_support = True
paging = True

base_url = "https://www.reuters.com"

# Number of results requested per page (also used to compute the offset).
results_per_page = 20
sort_order = "relevance"
"""Sort order, one of ``relevance``, ``display_date:desc`` or ``display_date:asc``."""

# Maps SearXNG's time-range names to a look-back window in days.
time_range_duration_map = {
    "day": 1,
    "week": 7,
    "month": 30,
    "year": 365,
}
|
||||||
|
|
||||||
|
|
||||||
|
def request(query, params):
    """Build the request URL for Reuters' internal article-search API.

    The API endpoint expects a single ``query`` URL parameter whose value is
    a JSON-encoded object carrying the actual search arguments (keyword,
    offset, sort order, page size, site).
    """
    args = {
        "keyword": query,
        # API paging is offset-based; SearXNG's pageno is 1-based.
        "offset": (params["pageno"] - 1) * results_per_page,
        "orderby": sort_order,
        "size": results_per_page,
        "website": "reuters",
    }

    # robustness: don't KeyError when "time_range" is absent; the truthiness
    # check also skips None / "" values.
    if params.get("time_range"):
        time_diff_days = time_range_duration_map[params["time_range"]]
        # NOTE(review): naive local time is used here — presumably the API
        # treats start_date loosely; confirm whether UTC is expected.
        start_date = datetime.now() - timedelta(days=time_diff_days)
        args["start_date"] = start_date.isoformat()

    params["url"] = f"{base_url}/pf/api/v3/content/fetch/articles-by-search-v2?query={quote_plus(dumps(args))}"
    return params
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp) -> EngineResults:
    """Parse the JSON article list into SearXNG main results."""
    res = EngineResults()

    for result in resp.json().get("result", {}).get("articles", []):
        res.add(
            res.types.MainResult(
                url=base_url + result["canonical_url"],
                title=result["web"],
                # robustness: some articles come without a description
                content=result.get("description", ""),
                thumbnail=result.get("thumbnail", {}).get("url", ""),
                # NOTE(review): this yields None when "kicker" is missing,
                # while MainResult.metadata defaults to "" — confirm None is
                # accepted downstream.
                metadata=result.get("kicker", {}).get("name"),
                # display_time is an ISO-8601 UTC timestamp with a "Z" suffix
                publishedDate=datetime.strptime(result["display_time"], "%Y-%m-%dT%H:%M:%SZ"),
            )
        )
    return res
|
@ -25,6 +25,8 @@ import re
|
|||||||
import urllib.parse
|
import urllib.parse
|
||||||
import warnings
|
import warnings
|
||||||
import typing
|
import typing
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
|
|
||||||
@ -212,6 +214,15 @@ def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result |
|
|||||||
result.normalize_result_fields()
|
result.normalize_result_fields()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_date_fields(result: MainResult | LegacyResult):
    """Derive ``pubdate`` (string form) from ``publishedDate`` in place.

    Dates that :py:obj:`datetime.datetime.strftime` cannot format (years
    before 1900 on some platforms) are discarded by resetting
    ``publishedDate`` to ``None``.
    """
    # nothing to do for an empty string or a None value
    if not result.publishedDate:
        return
    try:
        result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z')
    except ValueError:
        # strftime rejected the date (datetime module limitation) — drop it
        result.publishedDate = None
|
||||||
|
|
||||||
|
|
||||||
class Result(msgspec.Struct, kw_only=True):
|
class Result(msgspec.Struct, kw_only=True):
|
||||||
"""Base class of all result types :ref:`result types`."""
|
"""Base class of all result types :ref:`result types`."""
|
||||||
|
|
||||||
@ -347,6 +358,24 @@ class MainResult(Result): # pylint: disable=missing-class-docstring
|
|||||||
thumbnail: str = ""
|
thumbnail: str = ""
|
||||||
"""URL of a thumbnail that is displayed in the result item."""
|
"""URL of a thumbnail that is displayed in the result item."""
|
||||||
|
|
||||||
|
publishedDate: datetime.datetime | None = None
|
||||||
|
"""The date on which the object was published."""
|
||||||
|
|
||||||
|
pubdate: str = ""
|
||||||
|
"""String representation of :py:obj:`MainResult.publishedDate`"""
|
||||||
|
|
||||||
|
length: time.struct_time | None = None
|
||||||
|
"""Playing duration in seconds."""
|
||||||
|
|
||||||
|
views: str = ""
|
||||||
|
"""View count in humanized number format."""
|
||||||
|
|
||||||
|
author: str = ""
|
||||||
|
"""Author of the title."""
|
||||||
|
|
||||||
|
metadata: str = ""
|
||||||
|
"""Miscellaneous metadata."""
|
||||||
|
|
||||||
priority: typing.Literal["", "high", "low"] = ""
|
priority: typing.Literal["", "high", "low"] = ""
|
||||||
"""The priority can be set via :ref:`hostnames plugin`, for example."""
|
"""The priority can be set via :ref:`hostnames plugin`, for example."""
|
||||||
|
|
||||||
@ -379,8 +408,8 @@ class MainResult(Result): # pylint: disable=missing-class-docstring
|
|||||||
|
|
||||||
def normalize_result_fields(self):
|
def normalize_result_fields(self):
|
||||||
super().normalize_result_fields()
|
super().normalize_result_fields()
|
||||||
|
|
||||||
_normalize_text_fields(self)
|
_normalize_text_fields(self)
|
||||||
|
_normalize_date_fields(self)
|
||||||
if self.engine:
|
if self.engine:
|
||||||
self.engines.add(self.engine)
|
self.engines.add(self.engine)
|
||||||
|
|
||||||
@ -419,6 +448,8 @@ class LegacyResult(dict):
|
|||||||
positions: list[int]
|
positions: list[int]
|
||||||
score: float
|
score: float
|
||||||
category: str
|
category: str
|
||||||
|
publishedDate: datetime.datetime | None = None
|
||||||
|
pubdate: str = ""
|
||||||
|
|
||||||
# infobox result
|
# infobox result
|
||||||
urls: list[dict[str, str]]
|
urls: list[dict[str, str]]
|
||||||
@ -514,6 +545,7 @@ class LegacyResult(dict):
|
|||||||
return f"LegacyResult: {super().__repr__()}"
|
return f"LegacyResult: {super().__repr__()}"
|
||||||
|
|
||||||
def normalize_result_fields(self):
|
def normalize_result_fields(self):
|
||||||
|
_normalize_date_fields(self)
|
||||||
_normalize_url_fields(self)
|
_normalize_url_fields(self)
|
||||||
_normalize_text_fields(self)
|
_normalize_text_fields(self)
|
||||||
if self.engine:
|
if self.engine:
|
||||||
|
@ -1741,6 +1741,12 @@ engines:
|
|||||||
page_size: 25
|
page_size: 25
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
- name: reuters
|
||||||
|
engine: reuters
|
||||||
|
shortcut: reu
|
||||||
|
# https://docs.searxng.org/dev/engines/online/reuters.html
|
||||||
|
# sort_order: "relevance"
|
||||||
|
|
||||||
- name: right dao
|
- name: right dao
|
||||||
engine: xpath
|
engine: xpath
|
||||||
paging: true
|
paging: true
|
||||||
|
@ -694,14 +694,6 @@ def search():
|
|||||||
if 'title' in result and result['title']:
|
if 'title' in result and result['title']:
|
||||||
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
|
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
|
||||||
|
|
||||||
if getattr(result, 'publishedDate', None): # do not try to get a date from an empty string or a None type
|
|
||||||
try: # test if publishedDate >= 1900 (datetime module bug)
|
|
||||||
result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
|
|
||||||
except ValueError:
|
|
||||||
result['publishedDate'] = None
|
|
||||||
else:
|
|
||||||
result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])
|
|
||||||
|
|
||||||
# set result['open_group'] = True when the template changes from the previous result
|
# set result['open_group'] = True when the template changes from the previous result
|
||||||
# set result['close_group'] = True when the template changes on the next result
|
# set result['close_group'] = True when the template changes on the next result
|
||||||
if current_template != result.template:
|
if current_template != result.template:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user