From d1c584b9619cdc11d6d22ecc8cde5f7dc85054e9 Mon Sep 17 00:00:00 2001 From: Tommaso Colella Date: Sat, 29 Mar 2025 19:54:48 +0000 Subject: [PATCH] [feat] engine: add engine for italian press agency ansa --- searx/engines/ansa.py | 81 +++++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 5 +++ 2 files changed, 86 insertions(+) create mode 100644 searx/engines/ansa.py diff --git a/searx/engines/ansa.py b/searx/engines/ansa.py new file mode 100644 index 000000000..56cbee31a --- /dev/null +++ b/searx/engines/ansa.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Engine for Ansa, Italy's oldest news agency. + +To use this engine add the following entry to your engines +list in ``settings.yml``: + +.. code:: yaml + + - name: ansa + engine: ansa + shortcut: ans + disabled: false + +""" + +from urllib.parse import urlencode +from lxml import html +from searx.result_types import EngineResults, MainResult +from searx.utils import eval_xpath, eval_xpath_list, extract_text + +engine_type = 'online' +language_support = False +categories = ['news'] +paging = True +page_size = 12 +base_url = 'https://www.ansa.it' + +time_range_support = True +time_range_args = { + 'day': 1, + 'week': 7, + 'month': 31, + 'year': 365, +} +# https://www.ansa.it/ricerca/ansait/search.shtml?start=0&any=houthi&periodo=&sort=data%3Adesc +search_api = 'https://www.ansa.it/ricerca/ansait/search.shtml?' + +about = { + 'website': 'https://www.ansa.it', + 'wikidata_id': 'Q392934', + 'official_api_documentation': None, + 'use_official_api': False, + 'require_api_key': False, + 'results': 'HTML', + 'language': 'it', +} + + +def request(query, params): + query_params = { + 'any': query, + 'start': (params['pageno'] - 1) * page_size, + 'sort': "data:desc", + } + + if params['time_range']: + query_params['periodo'] = time_range_args.get(params['time_range']) + + params['url'] = search_api + urlencode(query_params) + return params + + +def response(resp) -> EngineResults: + res = EngineResults() + doc = html.fromstring(resp.text) + + for result in eval_xpath_list(doc, "//div[@class='article']"): + + res_obj = MainResult( + title=extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a")), + content=extract_text(eval_xpath(result, "./div[@class='content']/div[@class='text']")), + url=base_url + extract_text(eval_xpath(result, "./div[@class='content']/h2[@class='title']/a/@href")), + ) + + thumbnail = extract_text(eval_xpath(result, "./div[@class='image']/a/img/@src")) + if thumbnail: + res_obj.thumbnail = base_url + thumbnail + + res.append(res_obj) + + return res diff --git a/searx/settings.yml b/searx/settings.yml index 6ab6351ad..5a87da3cd 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -433,6 +433,11 @@ engines: disabled: true shortcut: aa + - name: ansa + engine: ansa + shortcut: ans + disabled: false + # - name: annas articles # engine: annas_archive # shortcut: aaa