From 6e3508fa60bef37dd0926c12dd7e6ae36fb0a4d9 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 30 Jan 2022 10:16:16 +0100
Subject: [PATCH] [enh] add digg engine back to SearXNG

digg was removed in 4c82ac767 since the API was no longer available; this
adds digg back by parsing HTML.  The implementation was copied from
e-foundation/searx [1][2].

The CDN for https://digg.com is Cloudflare.

[1] https://github.com/e-foundation/searx/commit/2eb3a41155ca3f8b4eac61dd4defa6d31cb300b1
[2] https://github.com/searx/searx/pull/3150

Signed-off-by: Markus Heiser
---
 searx/engines/digg.py | 67 +++++++++++++++++++++++++++++++++++++++++++
 searx/settings.yml    |  4 +++
 2 files changed, 71 insertions(+)
 create mode 100644 searx/engines/digg.py

diff --git a/searx/engines/digg.py b/searx/engines/digg.py
new file mode 100644
index 000000000..ef4517a7c
--- /dev/null
+++ b/searx/engines/digg.py
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Digg (News, Social media)
+
+"""
+
+from urllib.parse import urlencode
+from datetime import datetime
+
+from lxml import html
+from searx.utils import eval_xpath, extract_text
+
+# about
+about = {
+    "website": 'https://digg.com',
+    "wikidata_id": 'Q270478',
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+# engine dependent config
+categories = ['news', 'social media']
+paging = True
+base_url = 'https://digg.com'
+results_per_page = 10
+
+# search-url
+search_url = base_url + ('/search' '?{query}' '&size={size}' '&offset={offset}')
+
+
+def request(query, params):
+    offset = (params['pageno'] - 1) * results_per_page + 1
+    params['url'] = search_url.format(
+        query=urlencode({'q': query}),
+        size=results_per_page,
+        offset=offset,
+    )
+    return params
+
+
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    results_list = eval_xpath(dom, '//section[contains(@class, "search-results")]')
+
+    for result in results_list:
+
+        titles = eval_xpath(result, '//article//header//h2')
+        contents = eval_xpath(result, '//article//p')
+        urls = eval_xpath(result, '//header/a/@href')
+        published_dates = eval_xpath(result, '//article/div/div/time/@datetime')
+
+        for (title, content, url, published_date) in zip(titles, contents, urls, published_dates):
+            results.append(
+                {
+                    'url': url,
+                    'publishedDate': datetime.strptime(published_date, '%Y-%m-%dT%H:%M:%SZ'),
+                    'title': extract_text(title),
+                    'content': extract_text(content),
+                }
+            )
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index ad38d543d..1d07ef8b1 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -426,6 +426,10 @@ engines:
 #    timeout: 6.0
 #    disabled: true
 
+  - name: digg
+    engine: digg
+    shortcut: dg
+
   - name: docker hub
     engine: docker_hub
     shortcut: dh
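
Below, a minimal sketch of how the new engine's request()/response() pair
can be exercised outside a running SearXNG instance, assuming the searx
package is importable (e.g. from a source checkout). The FakeResponse
class and the sample HTML are hypothetical stand-ins for the real httpx
response object and Digg's markup; they only mimic the parts the engine
touches (the 'pageno' key in params and resp.text):

    from searx.engines import digg

    # build the search URL for page 2, as the searx core would do
    params = digg.request('ducks', {'pageno': 2})
    print(params['url'])
    # https://digg.com/search?q=ducks&size=10&offset=11

    # feed the parser a tiny document shaped like Digg's result markup
    class FakeResponse:  # hypothetical stand-in for the httpx response
        text = (
            '<section class="search-results"><article>'
            '<header><a href="https://example.org/item">'
            '<h2>Example title</h2></a></header>'
            '<p>Example snippet.</p>'
            '<div><div><time datetime="2022-01-30T10:16:16Z"></time></div></div>'
            '</article></section>'
        )

    for r in digg.response(FakeResponse()):
        print(r['title'], r['url'], r['publishedDate'])

Once the settings.yml entry is in place, the engine also answers the
!dg bang in queries. Note that response() collects titles, snippets,
URLs and dates with four parallel XPath queries and pairs them via
zip(), so a result is silently dropped as soon as one of the four lists
comes up shorter than the others.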