[feat] engines: add Baidu images & kaifa (IT docs) categories to the baidu.com engine

This commit is contained in:
Zhijie He 2025-03-06 19:28:58 +08:00 committed by Markus Heiser
parent 11b5ae186b
commit 4ce7f1accc
2 changed files with 122 additions and 15 deletions

View File

@ -9,8 +9,11 @@
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
import time
import json
from searx.exceptions import SearxEngineAPIException from searx.exceptions import SearxEngineAPIException
from searx.utils import html_to_text
about = { about = {
"website": "https://www.baidu.com", "website": "https://www.baidu.com",
@ -23,33 +26,85 @@ about = {
} }
paging = True paging = True
categories = ["general"] categories = []
base_url = "https://www.baidu.com/s"
results_per_page = 10 results_per_page = 10
baidu_category = 'general'
time_range_support = True
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
def init(_):
    """Engine setup hook: reject an unsupported ``baidu_category`` early.

    Raises SearxEngineAPIException when the configured category is not one
    of the three Baidu endpoints this engine knows how to query.
    """
    supported = ('general', 'images', 'it')
    if baidu_category in supported:
        return
    raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
def request(query, params):
    """Build the outgoing request URL for the configured Baidu category.

    Fills ``params["url"]`` with the category-specific endpoint and query
    string; supports optional time-range filtering for the 'general' and
    'it' categories.
    """
    page_num = params["pageno"]
    offset = (page_num - 1) * results_per_page

    # Endpoint and base query parameters per category. An unknown category
    # would raise KeyError, but init() already rejects those at setup time.
    endpoints = {
        'general': 'https://www.baidu.com/s',
        'images': 'https://image.baidu.com/search/acjson',
        'it': 'https://kaifa.baidu.com/rest/v1/search',
    }
    base_params = {
        'general': {
            "wd": query,
            "rn": results_per_page,
            "pn": offset,
            "tn": "json",
        },
        'images': {
            "word": query,
            "rn": results_per_page,
            "pn": offset,
            "tn": "resultjson_com",
        },
        'it': {
            "wd": query,
            "pageSize": results_per_page,
            "pageNum": page_num,
            "paramList": f"page_num={page_num},page_size={results_per_page}",
            "position": 0,
        },
    }
    query_params = base_params[baidu_category]
    endpoint = endpoints[baidu_category]

    # Time-range filtering: 'general' uses the gpc parameter,
    # 'it' appends a timestamp range to paramList; 'images' has no support.
    if params.get("time_range") in time_range_dict:
        now = int(time.time())
        past = now - time_range_dict[params["time_range"]]
        if baidu_category == 'general':
            query_params["gpc"] = f"stf={past},{now}|stftype=1"
        if baidu_category == 'it':
            query_params["paramList"] += f",timestamp_range={past}-{now}"

    params["url"] = f"{endpoint}?{urlencode(query_params)}"
    return params
def response(resp):
    """Decode the JSON payload and delegate to the category-specific parser.

    ``strict=False`` tolerates raw control characters that Baidu sometimes
    embeds in its JSON. Raises SearxEngineAPIException on undecodable input.
    """
    try:
        data = json.loads(resp.text, strict=False)
    except Exception as e:
        raise SearxEngineAPIException(f"Invalid response: {e}") from e

    # Dispatch on the module-level category chosen in settings.yml.
    parser = {
        'general': parse_general,
        'images': parse_images,
        'it': parse_it,
    }[baidu_category]
    return parser(data)
def parse_general(data):
results = []
if not data.get("feed", {}).get("entry"):
raise SearxEngineAPIException("Invalid response") raise SearxEngineAPIException("Invalid response")
for entry in data["feed"]["entry"]: for entry in data["feed"]["entry"]:
@ -69,8 +124,44 @@ def response(resp):
"url": entry["url"], "url": entry["url"],
"content": entry.get("abs", ""), "content": entry.get("abs", ""),
"publishedDate": published_date, "publishedDate": published_date,
# "source": entry.get('source')
} }
) )
return results
def parse_images(data):
    """Extract image results from a Baidu Images (acjson) response.

    Returns a list of ``images.html`` result dicts; an empty list when the
    payload carries no "data" entries.
    """
    results = []
    for item in data.get("data", []):
        # "replaceUrl" may be missing OR present as an empty list; the
        # original `item.get("replaceUrl", [{}])[0]` raised IndexError for
        # the empty-list case. Fall back to an empty mapping so a malformed
        # item yields blank URLs instead of crashing the whole page.
        replace_url = (item.get("replaceUrl") or [{}])[0]
        # Baidu escapes slashes in these URLs ("\\/"); normalize them.
        from_url = replace_url.get("FromURL", "").replace("\\/", "/")
        img_src = replace_url.get("ObjURL", "").replace("\\/", "/")
        results.append(
            {
                "template": "images.html",
                "url": from_url,
                "thumbnail_src": item.get("thumbURL", ""),
                "img_src": img_src,
                "content": html_to_text(item.get("fromPageTitleEnc", "")),
                "title": html_to_text(item.get("fromPageTitle", "")),
                "source": item.get("fromURLHost", ""),
            }
        )
    return results
def parse_it(data):
    """Build result entries from a kaifa.baidu.com (IT docs) response.

    Raises SearxEngineAPIException when the expected nested document list
    is absent or empty.
    """
    documents = data.get("data", {}).get("documents", {}).get("data")
    if not documents:
        raise SearxEngineAPIException("Invalid response")

    results = []
    for doc in documents:
        digest = doc["techDocDigest"]
        results.append(
            {
                'title': digest["title"],
                'url': digest["url"],
                'content': digest["summary"],
            }
        )
    return results

View File

@ -502,8 +502,24 @@ engines:
categories: music categories: music
- name: baidu - name: baidu
baidu_category: general
categories: [general]
engine: baidu engine: baidu
shortcut: baidu shortcut: bd
disabled: true
- name: baidu images
baidu_category: images
categories: [images]
engine: baidu
shortcut: bdi
disabled: true
- name: baidu kaifa
baidu_category: it
categories: [it]
engine: baidu
shortcut: bdk
disabled: true disabled: true
- name: wikipedia - name: wikipedia