diff --git a/searx/engines/acfun.py b/searx/engines/acfun.py new file mode 100644 index 000000000..ab30c13f2 --- /dev/null +++ b/searx/engines/acfun.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Acfun search engine for searxng""" + +from urllib.parse import urlencode +import re +import json +from datetime import datetime, timedelta +from lxml import html + +from searx.utils import extract_text + +# Metadata +about = { + "website": "https://www.acfun.cn/", + "wikidata_id": "Q3077675", + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +# Engine Configuration +categories = ["videos"] +paging = True + +# Base URL +base_url = "https://www.acfun.cn" + + +def request(query, params): + query_params = {"keyword": query, "pCursor": params["pageno"]} + params["url"] = f"{base_url}/search?{urlencode(query_params)}" + return params + + +def response(resp): + results = [] + + matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL) + if not matches: + return results + + for match in matches: + try: + json_data = json.loads(match) + raw_html = json_data.get("html", "") + if not raw_html: + continue + + tree = html.fromstring(raw_html) + + video_blocks = tree.xpath('//div[contains(@class, "search-video")]') + if not video_blocks: + continue + + for video_block in video_blocks: + video_info = extract_video_data(video_block) + if video_info and video_info["title"] and video_info["url"]: + results.append(video_info) + + except json.JSONDecodeError: + continue + + return results + + +def extract_video_data(video_block): + try: + data_exposure_log = video_block.get('data-exposure-log') + video_data = json.loads(data_exposure_log) + + content_id = video_data.get("content_id", "") + title = video_data.get("title", "") + + url = f"{base_url}/v/ac{content_id}" + iframe_src = f"{base_url}/player/ac{content_id}" + + create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]')) + video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')[0]) + video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]')) + video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]')) + + published_date = None + if create_time: + try: + published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d") + except (ValueError, TypeError): + pass + + length = None + if video_duration: + try: + timediff = datetime.strptime(video_duration.strip(), "%M:%S") + length = timedelta(minutes=timediff.minute, seconds=timediff.second) + except (ValueError, TypeError): + pass + + return { + "title": title, + "url": url, + "content": video_intro, + "thumbnail": video_cover, + "length": length, + "publishedDate": published_date, + "iframe_src": iframe_src, + } + + except (json.JSONDecodeError, AttributeError, TypeError, ValueError): + return None diff --git a/searx/settings.yml b/searx/settings.yml index 9662d2d09..c0b740e99 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -352,6 +352,11 @@ engines: shortcut: 9g disabled: true + - name: acfun + engine: acfun + shortcut: acf + disabled: true + - name: adobe stock engine: adobe_stock shortcut: asi