diff --git a/searx/engines/baidu.py b/searx/engines/baidu.py index 86626e87e..b4aba587d 100644 --- a/searx/engines/baidu.py +++ b/searx/engines/baidu.py @@ -9,8 +9,11 @@ from urllib.parse import urlencode from datetime import datetime +import time +import json from searx.exceptions import SearxEngineAPIException +from searx.utils import html_to_text about = { "website": "https://www.baidu.com", @@ -23,33 +26,85 @@ about = { } paging = True -categories = ["general"] -base_url = "https://www.baidu.com/s" +categories = [] results_per_page = 10 +baidu_category = 'general' + +time_range_support = True +time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000} + + +def init(_): + if baidu_category not in ('general', 'images', 'it'): + raise SearxEngineAPIException(f"Unsupported category: {baidu_category}") + def request(query, params): - keyword = query.strip() + page_num = params["pageno"] - query_params = { - "wd": keyword, - "rn": results_per_page, - "pn": (params["pageno"] - 1) * results_per_page, - "tn": "json", + category_config = { + 'general': { + 'endpoint': 'https://www.baidu.com/s', + 'params': { + "wd": query, + "rn": results_per_page, + "pn": (page_num - 1) * results_per_page, + "tn": "json", + }, + }, + 'images': { + 'endpoint': 'https://image.baidu.com/search/acjson', + 'params': { + "word": query, + "rn": results_per_page, + "pn": (page_num - 1) * results_per_page, + "tn": "resultjson_com", + }, + }, + 'it': { + 'endpoint': 'https://kaifa.baidu.com/rest/v1/search', + 'params': { + "wd": query, + "pageSize": results_per_page, + "pageNum": page_num, + "paramList": f"page_num={page_num},page_size={results_per_page}", + "position": 0, + }, + }, } - params["url"] = f"{base_url}?{urlencode(query_params)}" + query_params = category_config[baidu_category]['params'] + query_url = category_config[baidu_category]['endpoint'] + + if params.get("time_range") in time_range_dict: + now = int(time.time()) + past = now - time_range_dict[params["time_range"]] + + if baidu_category == 'general': + query_params["gpc"] = f"stf={past},{now}|stftype=1" + + if baidu_category == 'it': + query_params["paramList"] += f",timestamp_range={past}-{now}" + + params["url"] = f"{query_url}?{urlencode(query_params)}" return params def response(resp): try: - data = resp.json() + data = json.loads(resp.text, strict=False) except Exception as e: raise SearxEngineAPIException(f"Invalid response: {e}") from e - results = [] - if "feed" not in data or "entry" not in data["feed"]: + parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it} + + return parsers[baidu_category](data) + + +def parse_general(data): + results = [] + if not data.get("feed", {}).get("entry"): raise SearxEngineAPIException("Invalid response") for entry in data["feed"]["entry"]: @@ -69,8 +124,44 @@ def response(resp): "url": entry["url"], "content": entry.get("abs", ""), "publishedDate": published_date, - # "source": entry.get('source') } ) - + return results + + +def parse_images(data): + results = [] + if "data" in data: + for item in data["data"]: + replace_url = item.get("replaceUrl", [{}])[0] + from_url = replace_url.get("FromURL", "").replace("\\/", "/") + img_src = replace_url.get("ObjURL", "").replace("\\/", "/") + + results.append( + { + "template": "images.html", + "url": from_url, + "thumbnail_src": item.get("thumbURL", ""), + "img_src": img_src, + "content": html_to_text(item.get("fromPageTitleEnc", "")), + "title": html_to_text(item.get("fromPageTitle", "")), + "source": item.get("fromURLHost", ""), + } + ) + return results + + +def parse_it(data): + results = [] + if not data.get("data", {}).get("documents", {}).get("data"): + raise SearxEngineAPIException("Invalid response") + + for entry in data["data"]["documents"]["data"]: + results.append( + { + 'title': entry["techDocDigest"]["title"], + 'url': entry["techDocDigest"]["url"], + 'content': entry["techDocDigest"]["summary"], + } + ) return results diff --git a/searx/settings.yml b/searx/settings.yml index 22adf6626..a08538c98 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -502,8 +502,24 @@ engines: categories: music - name: baidu + baidu_categories: general + categories: [general] engine: baidu - shortcut: baidu + shortcut: bd + disabled: true + + - name: baidu images + baidu_categories: images + categories: [images] + engine: baidu + shortcut: bdi + disabled: true + + - name: baidu kaifa + baidu_categories: it + categories: [it] + engine: baidu + shortcut: bdk disabled: true - name: wikipedia