From 76f52b5b45a08776ad09e9b670a80635ab303c96 Mon Sep 17 00:00:00 2001
From: Zhijie He
Date: Sun, 2 Mar 2025 11:18:30 +0800
Subject: [PATCH] [feat] add Sogou WeChat article search support

---

Notes:
    Summary of the hunks below (this section is not part of the commit
    message):

    * searx/engines/sogou.py: removes the `max_page = 10` setting.
    * searx/engines/sogou_images.py: sets `paging = True`; request() now
      builds the query string with urlencode() and sends a `start` offset
      of `(pageno - 1) * 48`.
    * searx/engines/sogou_wechat.py: new engine in the "news" category.
      request() queries `{base_url}/weixin` with `query`, `page` and
      `type=2`; response() turns every `<li>` whose id contains
      "sogou_vr_" into a result with title, url, content, thumbnail and
      publishedDate (parsed from the `timeConvert('<digits>')` script).
    * searx/settings.yml: registers "sogou wechat" (engine `sogou_wechat`,
      shortcut `sogouw`) with `disabled: true`.

 searx/engines/sogou.py        |  1 -
 searx/engines/sogou_images.py | 10 ++++-
 searx/engines/sogou_wechat.py | 75 +++++++++++++++++++++++++++++++++++
 searx/settings.yml            |  5 +++
 4 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 searx/engines/sogou_wechat.py

diff --git a/searx/engines/sogou.py b/searx/engines/sogou.py
index e36b4cd4f..d1390ab43 100644
--- a/searx/engines/sogou.py
+++ b/searx/engines/sogou.py
@@ -18,7 +18,6 @@ about = {
 # Engine Configuration
 categories = ["general"]
 paging = True
-max_page = 10
 time_range_support = True
 time_range_dict = {'day': 'inttime_day', 'week': 'inttime_week', 'month': 'inttime_month', 'year': 'inttime_year'}
 
diff --git a/searx/engines/sogou_images.py b/searx/engines/sogou_images.py
index 69992e3ba..fec3ac12c 100644
--- a/searx/engines/sogou_images.py
+++ b/searx/engines/sogou_images.py
@@ -3,7 +3,7 @@
 
 import json
 import re
-from urllib.parse import quote_plus
+from urllib.parse import urlencode
 
 # about
 about = {
@@ -16,12 +16,18 @@ about = {
 
 # engine dependent config
 categories = ["images"]
+paging = True
 
 base_url = "https://pic.sogou.com"
 
 
 def request(query, params):
-    params["url"] = f"{base_url}/pics?query={quote_plus(query)}"
+    query_params = {
+        "query": query,
+        "start": (params["pageno"] - 1) * 48,
+    }
+
+    params["url"] = f"{base_url}/pics?{urlencode(query_params)}"
     return params
 
 
diff --git a/searx/engines/sogou_wechat.py b/searx/engines/sogou_wechat.py
new file mode 100644
index 0000000..caca1d48f
--- /dev/null
+++ b/searx/engines/sogou_wechat.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Sogou-WeChat search engine for retrieving WeChat Article from Sogou"""
+
+from urllib.parse import urlencode
+from datetime import datetime
+import re
+from lxml import html
+
+from searx.utils import extract_text
+
+# Metadata
+about = {
+    "website": "https://weixin.sogou.com/",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# Engine Configuration
+categories = ["news"]
+paging = True
+
+# Base URL
+base_url = "https://weixin.sogou.com"
+
+
+def request(query, params):
+    query_params = {
+        "query": query,
+        "page": params["pageno"],
+        "type": 2,
+    }
+
+    params["url"] = f"{base_url}/weixin?{urlencode(query_params)}"
+    return params
+
+
+def response(resp):
+    dom = html.fromstring(resp.text)
+    results = []
+
+    for item in dom.xpath('//li[contains(@id, "sogou_vr_")]'):
+        title = extract_text(item.xpath('.//h3/a'))
+        url = extract_text(item.xpath('.//h3/a/@href'))
+
+        if url.startswith("/link?url="):
+            url = f"{base_url}{url}"
+
+        content = extract_text(item.xpath('.//p[@class="txt-info"]'))
+        if not content:
+            content = extract_text(item.xpath('.//p[contains(@class, "txt-info")]'))
+
+        thumbnail = extract_text(item.xpath('.//div[@class="img-box"]/a/img/@src'))
+        if thumbnail and thumbnail.startswith("//"):
+            thumbnail = f"https:{thumbnail}"
+
+        published_date = None
+        timestamp = extract_text(item.xpath('.//script[contains(text(), "timeConvert")]'))
+        if timestamp:
+            match = re.search(r"timeConvert\('(\d+)'\)", timestamp)
+            if match:
+                published_date = datetime.fromtimestamp(int(match.group(1)))
+
+        if title and url:
+            results.append(
+                {
+                    "title": title,
+                    "url": url,
+                    "content": content,
+                    'thumbnail': thumbnail,
+                    "publishedDate": published_date,
+                }
+            )
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index d379718fb..6aafaeb63 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1726,6 +1726,11 @@ engines:
     shortcut: sogouv
     disabled: true
 
+  - name: sogou wechat
+    engine: sogou_wechat
+    shortcut: sogouw
+    disabled: true
+
   - name: soundcloud
     engine: soundcloud
     shortcut: sc