diff --git a/searx/engines/quark.py b/searx/engines/quark.py
new file mode 100644
index 000000000..957ef0d97
--- /dev/null
+++ b/searx/engines/quark.py
@@ -0,0 +1,383 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Quark (Shenma) search engine for searxng"""
+
+from urllib.parse import urlencode
+from datetime import datetime
+import re
+import json
+
+from searx.utils import html_to_text, gen_useragent
+from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
+
+# Metadata
+about = {
+    "website": "https://m.quark.cn/",
+    "wikidata_id": "Q48816502",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+    "language": "zh",
+}
+
+# Engine Configuration
+categories = []
+paging = True
+results_per_page = 10
+
+quark_category = 'general'
+
+time_range_support = True
+time_range_dict = {'day': '4', 'week': '3', 'month': '2', 'year': '1'}
+
+CAPTCHA_PATTERN = r'\{[^{]*?"action"\s*:\s*"captcha"\s*,\s*"url"\s*:\s*"([^"]+)"[^{]*?\}'
+
+
+def is_alibaba_captcha(html):
+    """
+    Detects if the response contains an Alibaba X5SEC CAPTCHA page.
+
+    Quark may return a CAPTCHA challenge after 9 requests in a short period.
+
+    Typically, the ban duration is around 15 minutes.
+    """
+    return bool(re.search(CAPTCHA_PATTERN, html))
+
+
+def init(_):
+    if quark_category not in ('general', 'images'):
+        raise SearxEngineAPIException(f"Unsupported category: {quark_category}")
+
+
+def request(query, params):
+    page_num = params["pageno"]
+
+    category_config = {
+        'general': {
+            'endpoint': 'https://m.quark.cn/s',
+            'params': {
+                "q": query,
+                "layout": "html",
+                "page": page_num,
+            },
+        },
+        'images': {
+            'endpoint': 'https://vt.sm.cn/api/pic/list',
+            'params': {
+                "query": query,
+                "limit": results_per_page,
+                "start": (page_num - 1) * results_per_page,
+            },
+        },
+    }
+
+    query_params = category_config[quark_category]['params']
+    query_url = category_config[quark_category]['endpoint']
+
+    if time_range_dict.get(params['time_range']) and quark_category == 'general':
+        query_params["tl_request"] = time_range_dict.get(params['time_range'])
+
+    params["url"] = f"{query_url}?{urlencode(query_params)}"
+    params["headers"] = {
+        "User-Agent": gen_useragent(),
+    }
+    return params
+
+
+def response(resp):
+    results = []
+    text = resp.text
+
+    if is_alibaba_captcha(text):
+        raise SearxEngineCaptchaException(
+            suspended_time=900, message="Alibaba CAPTCHA detected. Please try again later."
+        )
+
+    if quark_category == 'images':
+        data = json.loads(text)
+        for item in data.get('data', {}).get('hit', {}).get('imgInfo', {}).get('item', []):
+            try:
+                published_date = datetime.fromtimestamp(int(item.get("publish_time")))
+            except (ValueError, TypeError):
+                published_date = None
+
+            results.append(
+                {
+                    "template": "images.html",
+                    "url": item.get("imgUrl"),
+                    "thumbnail_src": item.get("img"),
+                    "img_src": item.get("bigPicUrl"),
+                    "title": item.get("title"),
+                    "source": item.get("site"),
+                    "resolution": f"{item['width']} x {item['height']}",
+                    "publishedDate": published_date,
+                }
+            )
+
+    if quark_category == 'general':
+        # Quark returns a variety of different sc values on a single page, depending on the query type.
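+        # Each embedded JSON block parsed below has roughly this (assumed,
+        # simplified) shape:
+        #   {"data": {"initialData": {...parser-specific fields...}},
+        #    "extraData": {"sc": "ss_doc"}}
+        # The sc value selects a parser; blocks with an unknown sc are skipped.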
+        source_category_parsers = {
+            'addition': parse_addition,
+            'ai_page': parse_ai_page,
+            'baike_sc': parse_baike_sc,
+            'finance_shuidi': parse_finance_shuidi,
+            'kk_yidian_all': parse_kk_yidian_all,
+            'life_show_general_image': parse_life_show_general_image,
+            'med_struct': parse_med_struct,
+            'music_new_song': parse_music_new_song,
+            'nature_result': parse_nature_result,
+            'news_uchq': parse_news_uchq,
+            'ss_note': parse_ss_note,
+            # ss_kv, ss_pic, ss_text, ss_video, baike, structure_web_novel use the same structure as ss_doc
+            'ss_doc': parse_ss_doc,
+            'ss_kv': parse_ss_doc,
+            'ss_pic': parse_ss_doc,
+            'ss_text': parse_ss_doc,
+            'ss_video': parse_ss_doc,
+            'baike': parse_ss_doc,
+            'structure_web_novel': parse_ss_doc,
+            'travel_dest_overview': parse_travel_dest_overview,
+            'travel_ranking_list': parse_travel_ranking_list,
+        }
+
+        pattern = r'<script\s+type="application/json"\s+id="s-data-[^"]+">(.*?)</script>'
+        matches = re.findall(pattern, text, re.DOTALL)
+
+        for match in matches:
+            data = json.loads(match)
+            initial_data = data.get('data', {}).get('initialData', {})
+            extra_data = data.get('extraData', {})
+
+            source_category = extra_data.get('sc')
+
+            parser = source_category_parsers.get(source_category)
+            if parser:
+                parsed_results = parser(initial_data)
+                if isinstance(parsed_results, list):
+                    # Extend if the parser returned a list of results
+                    results.extend(parsed_results)
+                else:
+                    # Append if it returned a single result
+                    results.append(parsed_results)
+
+    return results
+
+
+def parse_addition(data):
+    return {
+        "title": html_to_text(data.get('title', {}).get('content')),
+        "url": data.get('source', {}).get('url'),
+        "content": html_to_text(data.get('summary', {}).get('content')),
+    }
+
+
+def parse_ai_page(data):
+    results = []
+    for item in data.get('list', []):
+        content = (
+            " | ".join(map(str, item.get('content', [])))
+            if isinstance(item.get('content'), list)
+            else str(item.get('content'))
+        )
+
+        try:
+            published_date = datetime.fromtimestamp(int(item.get('source', {}).get('time')))
+        except (ValueError, TypeError):
+            published_date = None
+
+        results.append(
+            {
+                "title": html_to_text(item.get('title')),
+                "url": item.get('url'),
+                "content": html_to_text(content),
+                "publishedDate": published_date,
+            }
+        )
+    return results
+
+
+def parse_baike_sc(data):
+    return {
+        "title": html_to_text(data.get('data', {}).get('title')),
+        "url": data.get('data', {}).get('url'),
+        "content": html_to_text(data.get('data', {}).get('abstract')),
+        "thumbnail": data.get('data', {}).get('img').replace("http://", "https://"),
+    }
+
+
+def parse_finance_shuidi(data):
+    content = " | ".join(
+        (
+            info
+            for info in [
+                data.get('establish_time'),
+                data.get('company_status'),
+                data.get('controled_type'),
+                data.get('company_type'),
+                data.get('capital'),
+                data.get('address'),
+                data.get('business_scope'),
+            ]
+            if info
+        )
+    )
+    return {
+        "title": html_to_text(data.get('company_name')),
+        "url": data.get('title_url'),
+        "content": html_to_text(content),
+    }
+
+
+def parse_kk_yidian_all(data):
+    content_list = []
+    for section in data.get('list_container', []):
+        for item in section.get('list_container', []):
+            if 'dot_text' in item:
+                content_list.append(item['dot_text'])
+
+    return {
+        "title": html_to_text(data.get('title')),
+        "url": data.get('title_url'),
+        "content": html_to_text(' '.join(content_list)),
+    }
+
+
+def parse_life_show_general_image(data):
+    results = []
+    for item in data.get('image', []):
+        try:
+            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
+        except (ValueError, TypeError):
+            published_date = None
+
+        results.append(
+            {
+                "template": "images.html",
+                "url": item.get("imgUrl"),
+                "thumbnail_src": item.get("img"),
+                "img_src": item.get("bigPicUrl"),
+                "title": item.get("title"),
+                "source": item.get("site"),
+                "resolution": f"{item['width']} x {item['height']}",
+                "publishedDate": published_date,
+            }
+        )
+    return results
+
+
+def parse_med_struct(data):
+    return {
+        "title": html_to_text(data.get('title')),
+        "url": data.get('message', {}).get('statistics', {}).get('nu'),
+        "content": html_to_text(data.get('message', {}).get('content_text')),
+        "thumbnail": data.get('message', {}).get('video_img').replace("http://", "https://"),
+    }
+
+
+def parse_music_new_song(data):
+    results = []
+    for item in data.get('hit3', []):
+        results.append(
+            {
+                "title": f"{item['song_name']} | {item['song_singer']}",
+                "url": item.get("play_url"),
+                "content": html_to_text(item.get("lyrics")),
+                "thumbnail": item.get("image_url").replace("http://", "https://"),
+            }
+        )
+    return results
+
+
+def parse_nature_result(data):
+    return {"title": html_to_text(data.get('title')), "url": data.get('url'), "content": html_to_text(data.get('desc'))}
+
+
+def parse_news_uchq(data):
+    results = []
+    for item in data.get('feed', []):
+        try:
+            published_date = datetime.strptime(item.get('time'), "%Y-%m-%d")
+        except (ValueError, TypeError):
+            # Sometimes Quark returns a non-standard format like "1天前" ("1 day ago"); set published_date to None
+            published_date = None
+
+        results.append(
+            {
+                "title": html_to_text(item.get('title')),
+                "url": item.get('url'),
+                "content": html_to_text(item.get('summary')),
+                "thumbnail": item.get('image').replace("http://", "https://"),
+                "publishedDate": published_date,
+            }
+        )
+    return results
+
+
+def parse_ss_doc(data):
+    published_date = None
+    try:
+        timestamp = int(data.get('sourceProps', {}).get('time'))
+
+        # Sometimes Quark returns 0; leave published_date as None in that case
+        if timestamp != 0:
+            published_date = datetime.fromtimestamp(timestamp)
+    except (ValueError, TypeError):
+        pass
+
+    try:
+        thumbnail = data.get('picListProps', [])[0].get('src').replace("http://", "https://")
+    # AttributeError covers a missing or None 'src' value
+    except (ValueError, TypeError, IndexError, AttributeError):
+        thumbnail = None
+
+    return {
+        "title": html_to_text(
+            data.get('titleProps', {}).get('content')
+            # ss_kv variant 1 & 2
+            or data.get('title')
+        ),
+        "url": data.get('sourceProps', {}).get('dest_url')
+        # ss_kv variant 1
+        or data.get('normal_url')
+        # ss_kv variant 2
+        or data.get('url'),
+        "content": html_to_text(
+            data.get('summaryProps', {}).get('content')
+            # ss_doc variant 1
+            or data.get('message', {}).get('replyContent')
+            # ss_kv variant 1
+            or data.get('show_body')
+            # ss_kv variant 2
+            or data.get('desc')
+        ),
+        "publishedDate": published_date,
+        "thumbnail": thumbnail,
+    }
+
+
+def parse_ss_note(data):
+    try:
+        published_date = datetime.fromtimestamp(int(data.get('source', {}).get('time')))
+    except (ValueError, TypeError):
+        published_date = None
+
+    return {
+        "title": html_to_text(data.get('title', {}).get('content')),
+        "url": data.get('source', {}).get('dest_url'),
+        "content": html_to_text(data.get('summary', {}).get('content')),
+        "publishedDate": published_date,
+    }
+
+
+def parse_travel_dest_overview(data):
+    return {
+        "title": html_to_text(data.get('strong', {}).get('title')),
+        "url": data.get('strong', {}).get('baike_url'),
+        "content": html_to_text(data.get('strong', {}).get('baike_text')),
+    }
+
+
+def parse_travel_ranking_list(data):
+    return {
+        "title": html_to_text(data.get('title', {}).get('text')),
+        "url": data.get('title', {}).get('url'),
+        "content": html_to_text(data.get('title', {}).get('title_tag')),
+    }
diff --git a/searx/settings.yml b/searx/settings.yml
index 0ce50f25f..55893f8a1 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1661,6 +1661,20 @@ engines:
     shortcut: pypi
     engine: pypi
 
+  - name: quark
+    quark_category: general
+    categories: [general]
+    engine: quark
+    shortcut: qk
+    disabled: true
+
+  - name: quark images
+    quark_category: images
+    categories: [images]
+    engine: quark
+    shortcut: qki
+    disabled: true
+
   - name: qwant
     qwant_categ: web
     engine: qwant
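
As a quick sanity check of the 'general' parsing path, the sketch below runs one fabricated embedded-JSON block through the same script-tag regex and sc dispatch that response() uses. The regex and payload shape mirror the engine above; the harness itself (sample_html and the demo field values) is hypothetical and not part of the diff:

    # Hypothetical smoke test for the 'general' parsing path (standalone, no searx imports).
    import json
    import re

    PATTERN = r'<script\s+type="application/json"\s+id="s-data-[^"]+">(.*?)</script>'

    # One fabricated block shaped like those Quark embeds in its result pages
    sample_html = (
        '<script type="application/json" id="s-data-demo">'
        + json.dumps(
            {
                "data": {"initialData": {"title": "example", "url": "https://example.org", "desc": "demo"}},
                "extraData": {"sc": "nature_result"},
            }
        )
        + '</script>'
    )

    for block in re.findall(PATTERN, sample_html, re.DOTALL):
        payload = json.loads(block)
        sc = payload.get('extraData', {}).get('sc')
        initial_data = payload.get('data', {}).get('initialData', {})
        # With sc == 'nature_result', the engine would hand initial_data to parse_nature_result
        print(sc, initial_data)  # -> nature_result {'title': 'example', 'url': 'https://example.org', 'desc': 'demo'}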