[feat]: engines add images & kaifa from baidu.com
This commit is contained in:
		
							parent
							
								
									11b5ae186b
								
							
						
					
					
						commit
						4ce7f1accc
					
				@ -9,8 +9,11 @@
 | 
			
		||||
 | 
			
		||||
from urllib.parse import urlencode
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
import time
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
from searx.exceptions import SearxEngineAPIException
 | 
			
		||||
from searx.utils import html_to_text
 | 
			
		||||
 | 
			
		||||
about = {
 | 
			
		||||
    "website": "https://www.baidu.com",
 | 
			
		||||
@ -23,33 +26,85 @@ about = {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
paging = True
 | 
			
		||||
categories = ["general"]
 | 
			
		||||
base_url = "https://www.baidu.com/s"
 | 
			
		||||
categories = []
 | 
			
		||||
results_per_page = 10
 | 
			
		||||
 | 
			
		||||
baidu_category = 'general'
 | 
			
		||||
 | 
			
		||||
time_range_support = True
 | 
			
		||||
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def init(_):
 | 
			
		||||
    if baidu_category not in ('general', 'images', 'it'):
 | 
			
		||||
        raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def request(query, params):
 | 
			
		||||
    keyword = query.strip()
 | 
			
		||||
    page_num = params["pageno"]
 | 
			
		||||
 | 
			
		||||
    query_params = {
 | 
			
		||||
        "wd": keyword,
 | 
			
		||||
    category_config = {
 | 
			
		||||
        'general': {
 | 
			
		||||
            'endpoint': 'https://www.baidu.com/s',
 | 
			
		||||
            'params': {
 | 
			
		||||
                "wd": query,
 | 
			
		||||
                "rn": results_per_page,
 | 
			
		||||
        "pn": (params["pageno"] - 1) * results_per_page,
 | 
			
		||||
                "pn": (page_num - 1) * results_per_page,
 | 
			
		||||
                "tn": "json",
 | 
			
		||||
            },
 | 
			
		||||
        },
 | 
			
		||||
        'images': {
 | 
			
		||||
            'endpoint': 'https://image.baidu.com/search/acjson',
 | 
			
		||||
            'params': {
 | 
			
		||||
                "word": query,
 | 
			
		||||
                "rn": results_per_page,
 | 
			
		||||
                "pn": (page_num - 1) * results_per_page,
 | 
			
		||||
                "tn": "resultjson_com",
 | 
			
		||||
            },
 | 
			
		||||
        },
 | 
			
		||||
        'it': {
 | 
			
		||||
            'endpoint': 'https://kaifa.baidu.com/rest/v1/search',
 | 
			
		||||
            'params': {
 | 
			
		||||
                "wd": query,
 | 
			
		||||
                "pageSize": results_per_page,
 | 
			
		||||
                "pageNum": page_num,
 | 
			
		||||
                "paramList": f"page_num={page_num},page_size={results_per_page}",
 | 
			
		||||
                "position": 0,
 | 
			
		||||
            },
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    params["url"] = f"{base_url}?{urlencode(query_params)}"
 | 
			
		||||
    query_params = category_config[baidu_category]['params']
 | 
			
		||||
    query_url = category_config[baidu_category]['endpoint']
 | 
			
		||||
 | 
			
		||||
    if params.get("time_range") in time_range_dict:
 | 
			
		||||
        now = int(time.time())
 | 
			
		||||
        past = now - time_range_dict[params["time_range"]]
 | 
			
		||||
 | 
			
		||||
        if baidu_category == 'general':
 | 
			
		||||
            query_params["gpc"] = f"stf={past},{now}|stftype=1"
 | 
			
		||||
 | 
			
		||||
        if baidu_category == 'it':
 | 
			
		||||
            query_params["paramList"] += f",timestamp_range={past}-{now}"
 | 
			
		||||
 | 
			
		||||
    params["url"] = f"{query_url}?{urlencode(query_params)}"
 | 
			
		||||
    return params
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def response(resp):
 | 
			
		||||
    try:
 | 
			
		||||
        data = resp.json()
 | 
			
		||||
        data = json.loads(resp.text, strict=False)
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        raise SearxEngineAPIException(f"Invalid response: {e}") from e
 | 
			
		||||
    results = []
 | 
			
		||||
 | 
			
		||||
    if "feed" not in data or "entry" not in data["feed"]:
 | 
			
		||||
    parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it}
 | 
			
		||||
 | 
			
		||||
    return parsers[baidu_category](data)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_general(data):
 | 
			
		||||
    results = []
 | 
			
		||||
    if not data.get("feed", {}).get("entry"):
 | 
			
		||||
        raise SearxEngineAPIException("Invalid response")
 | 
			
		||||
 | 
			
		||||
    for entry in data["feed"]["entry"]:
 | 
			
		||||
@ -69,8 +124,44 @@ def response(resp):
 | 
			
		||||
                "url": entry["url"],
 | 
			
		||||
                "content": entry.get("abs", ""),
 | 
			
		||||
                "publishedDate": published_date,
 | 
			
		||||
                # "source": entry.get('source')
 | 
			
		||||
            }
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    return results
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_images(data):
 | 
			
		||||
    results = []
 | 
			
		||||
    if "data" in data:
 | 
			
		||||
        for item in data["data"]:
 | 
			
		||||
            replace_url = item.get("replaceUrl", [{}])[0]
 | 
			
		||||
            from_url = replace_url.get("FromURL", "").replace("\\/", "/")
 | 
			
		||||
            img_src = replace_url.get("ObjURL", "").replace("\\/", "/")
 | 
			
		||||
 | 
			
		||||
            results.append(
 | 
			
		||||
                {
 | 
			
		||||
                    "template": "images.html",
 | 
			
		||||
                    "url": from_url,
 | 
			
		||||
                    "thumbnail_src": item.get("thumbURL", ""),
 | 
			
		||||
                    "img_src": img_src,
 | 
			
		||||
                    "content": html_to_text(item.get("fromPageTitleEnc", "")),
 | 
			
		||||
                    "title": html_to_text(item.get("fromPageTitle", "")),
 | 
			
		||||
                    "source": item.get("fromURLHost", ""),
 | 
			
		||||
                }
 | 
			
		||||
            )
 | 
			
		||||
    return results
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_it(data):
 | 
			
		||||
    results = []
 | 
			
		||||
    if not data.get("data", {}).get("documents", {}).get("data"):
 | 
			
		||||
        raise SearxEngineAPIException("Invalid response")
 | 
			
		||||
 | 
			
		||||
    for entry in data["data"]["documents"]["data"]:
 | 
			
		||||
        results.append(
 | 
			
		||||
            {
 | 
			
		||||
                'title': entry["techDocDigest"]["title"],
 | 
			
		||||
                'url': entry["techDocDigest"]["url"],
 | 
			
		||||
                'content': entry["techDocDigest"]["summary"],
 | 
			
		||||
            }
 | 
			
		||||
        )
 | 
			
		||||
    return results
 | 
			
		||||
 | 
			
		||||
@ -502,8 +502,24 @@ engines:
 | 
			
		||||
    categories: music
 | 
			
		||||
 | 
			
		||||
  - name: baidu
 | 
			
		||||
    baidu_categories: general
 | 
			
		||||
    categories: [general]
 | 
			
		||||
    engine: baidu
 | 
			
		||||
    shortcut: baidu
 | 
			
		||||
    shortcut: bd
 | 
			
		||||
    disabled: true
 | 
			
		||||
 | 
			
		||||
  - name: baidu images
 | 
			
		||||
    baidu_categories: images
 | 
			
		||||
    categories: [images]
 | 
			
		||||
    engine: baidu
 | 
			
		||||
    shortcut: bdi
 | 
			
		||||
    disabled: true
 | 
			
		||||
 | 
			
		||||
  - name: baidu kaifa
 | 
			
		||||
    baidu_categories: it
 | 
			
		||||
    categories: [it]
 | 
			
		||||
    engine: baidu
 | 
			
		||||
    shortcut: bdk
 | 
			
		||||
    disabled: true
 | 
			
		||||
 | 
			
		||||
  - name: wikipedia
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user