From 97aa5a779b3910efb2cc8f7497969fbe0d126910 Mon Sep 17 00:00:00 2001
From: Zhijie He
Date: Sun, 23 Feb 2025 13:47:03 +0800
Subject: [PATCH] [feat] add Sogou engine for searxng

Co-authored-by: Bnyro
---
 docs/admin/settings/settings_search.rst |  1 +
 searx/autocomplete.py                   | 19 +++++++
 searx/engines/sogou.py                  | 68 +++++++++++++++++++++++
 searx/engines/sogou_images.py           | 49 +++++++++++++++++
 searx/engines/sogou_videos.py           | 72 +++++++++++++++++++++++++
 searx/settings.yml                      | 17 +++++-
 6 files changed, 225 insertions(+), 1 deletion(-)
 create mode 100644 searx/engines/sogou.py
 create mode 100644 searx/engines/sogou_images.py
 create mode 100644 searx/engines/sogou_videos.py

diff --git a/docs/admin/settings/settings_search.rst b/docs/admin/settings/settings_search.rst
index a85640fa5..284a6c18b 100644
--- a/docs/admin/settings/settings_search.rst
+++ b/docs/admin/settings/settings_search.rst
@@ -42,6 +42,7 @@
    - ``mwmbl``
    - ``qwant``
    - ``seznam``
+   - ``sogou``
    - ``stract``
    - ``swisscows``
    - ``wikipedia``
diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index 2ef6189a5..8a4f0a66a 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -20,6 +20,7 @@ from searx.engines import (
 )
 from searx.network import get as http_get, post as http_post
 from searx.exceptions import SearxEngineResponseException
+from searx.utils import extr
 
 
 def update_kwargs(**kwargs):
@@ -186,6 +187,23 @@ def seznam(query, _lang):
     ]
 
 
+def sogou(query, _lang):
+    # Sogou search autocompleter
+    base_url = "https://sor.html5.qq.com/api/getsug?"
+    response = get(base_url + urlencode({'m': 'searxng', 'key': query}))
+
+    if response.ok:
+        raw_json = extr(response.text, "[", "]", default="")
+
+        try:
+            data = json.loads(f"[{raw_json}]]")
+            return data[1]
+        except json.JSONDecodeError:
+            return []
+
+    return []
+
+
 def stract(query, _lang):
     # stract autocompleter (beta)
     url = f"https://stract.com/beta/api/autosuggest?q={quote_plus(query)}"
@@ -270,6 +288,7 @@
     'mwmbl': mwmbl,
     'qwant': qwant,
     'seznam': seznam,
+    'sogou': sogou,
     'stract': stract,
     'swisscows': swisscows,
     'wikipedia': wikipedia,
diff --git a/searx/engines/sogou.py b/searx/engines/sogou.py
new file mode 100644
index 000000000..e36b4cd4f
--- /dev/null
+++ b/searx/engines/sogou.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Sogou search engine for searxng"""
+
+from urllib.parse import urlencode
+from lxml import html
+
+from searx.utils import extract_text
+
+# Metadata
+about = {
+    "website": "https://www.sogou.com/",
+    "wikidata_id": "Q7554565",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# Engine Configuration
+categories = ["general"]
+paging = True
+max_page = 10
+time_range_support = True
+
+time_range_dict = {'day': 'inttime_day', 'week': 'inttime_week', 'month': 'inttime_month', 'year': 'inttime_year'}
+
+# Base URL
+base_url = "https://www.sogou.com"
+
+
+def request(query, params):
+    query_params = {
+        "query": query,
+        "page": params["pageno"],
+    }
+
+    if time_range_dict.get(params['time_range']):
+        query_params["s_from"] = time_range_dict.get(params['time_range'])
+        query_params["tsn"] = 1
+
+    params["url"] = f"{base_url}/web?{urlencode(query_params)}"
+    return params
+
+
+def response(resp):
+    dom = html.fromstring(resp.text)
+    results = []
+
+    for item in dom.xpath('//div[contains(@class, "vrwrap")]'):
+        title = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a'))
+        url = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a/@href'))
+
+        if url.startswith("/link?url="):
+            url = f"{base_url}{url}"
+
+        content = extract_text(item.xpath('.//div[contains(@class, "text-layout")]//p[contains(@class, "star-wiki")]'))
+        if not content:
+            content = extract_text(item.xpath('.//div[contains(@class, "fz-mid space-txt")]'))
+
+        if title and url:
+            results.append(
+                {
+                    "title": title,
+                    "url": url,
+                    "content": content,
+                }
+            )
+
+    return results
diff --git a/searx/engines/sogou_images.py b/searx/engines/sogou_images.py
new file mode 100644
index 000000000..69992e3ba
--- /dev/null
+++ b/searx/engines/sogou_images.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Sogou-Images: A search engine for retrieving images from Sogou."""
+
+import json
+import re
+from urllib.parse import quote_plus
+
+# about
+about = {
+    "website": "https://pic.sogou.com/",
+    "wikidata_id": "Q7554565",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# engine dependent config
+categories = ["images"]
+
+base_url = "https://pic.sogou.com"
+
+
+def request(query, params):
+    params["url"] = f"{base_url}/pics?query={quote_plus(query)}"
+    return params
+
+
+def response(resp):
+    results = []
+    match = re.search(r'window\.__INITIAL_STATE__\s*=\s*({.*?});', resp.text, re.S)
+    if not match:
+        return results
+
+    data = json.loads(match.group(1))
+    if "searchList" in data and "searchList" in data["searchList"]:
+        for item in data["searchList"]["searchList"]:
+            results.append(
+                {
+                    "template": "images.html",
+                    "url": item.get("url", ""),
+                    "thumbnail_src": item.get("picUrl", ""),
+                    "img_src": item.get("picUrl", ""),
+                    "content": item.get("content_major", ""),
+                    "title": item.get("title", ""),
+                    "source": item.get("ch_site_name", ""),
+                }
+            )
+
+    return results
diff --git a/searx/engines/sogou_videos.py b/searx/engines/sogou_videos.py
new file mode 100644
index 000000000..1149996c9
--- /dev/null
+++ b/searx/engines/sogou_videos.py
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Sogou-Videos: A search engine for retrieving videos from Sogou."""
+
+from urllib.parse import urlencode
+from datetime import datetime
+
+from searx.exceptions import SearxEngineAPIException
+
+about = {
+    "website": "https://v.sogou.com/",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "JSON",
+}
+
+categories = ["videos"]
+paging = True
+results_per_page = 10
+
+# Base URL
+base_url = "https://v.sogou.com"
+
+
+def request(query, params):
+    query_params = {
+        "page": params["pageno"],
+        "pagesize": 10,
+        "query": query,
+    }
+
+    params["url"] = f"{base_url}/api/video/shortVideoV2?{urlencode(query_params)}"
+    return params
+
+
+def response(resp):
+    try:
+        data = resp.json()
+    except Exception as e:
+        raise SearxEngineAPIException(f"Invalid response: {e}") from e
+    results = []
+
+    if not data.get("data", {}).get("list"):
+        raise SearxEngineAPIException("Invalid response")
+
+    for entry in data["data"]["list"]:
+        if not entry.get("titleEsc") or not entry.get("url"):
+            continue
+
+        video_url = entry.get("url")
+        if video_url.startswith("/vc/np"):
+            video_url = f"{base_url}{video_url}"
+
+        published_date = None
+        if entry.get("date") and entry.get("duration"):
+            try:
+                date_time_str = f"{entry['date']} {entry['duration']}"
+                published_date = datetime.strptime(date_time_str, "%Y-%m-%d %H:%M")
+            except (ValueError, TypeError):
+                published_date = None
+
+        results.append(
+            {
+                'url': video_url,
+                'title': entry["titleEsc"],
+                'content': f"{entry['site']} | {entry['duration']}",
+                'template': 'videos.html',
+                'publishedDate': published_date,
+                'thumbnail': entry["picurl"],
+            }
+        )
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 496828464..d379718fb 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -34,7 +34,7 @@ search:
   # Filter results. 0: None, 1: Moderate, 2: Strict
   safe_search: 0
   # Existing autocomplete backends: "360search", "baidu", "brave", "dbpedia", "duckduckgo", "google", "yandex",
-  # "mwmbl", "seznam", "stract", "swisscows", "qwant", "wikipedia" -
+  # "mwmbl", "seznam", "sogou", "stract", "swisscows", "qwant", "wikipedia" -
   # leave blank to turn it off by default.
   autocomplete: ""
   # minimun characters to type before autocompleter starts
@@ -1711,6 +1711,21 @@ engines:
     engine: sepiasearch
     shortcut: sep
 
+  - name: sogou
+    engine: sogou
+    shortcut: sogou
+    disabled: true
+
+  - name: sogou images
+    engine: sogou_images
+    shortcut: sogoui
+    disabled: true
+
+  - name: sogou videos
+    engine: sogou_videos
+    shortcut: sogouv
+    disabled: true
+
   - name: soundcloud
     engine: soundcloud
     shortcut: sc