[feat] engines: add Quark engine
Co-authored-by: Bnyro <bnyro@tutanota.com>
parent a1d5add718, commit 68ffa4de53

searx/engines/quark.py (new file, 383 lines)
@@ -0,0 +1,383 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Quark (Shenma) search engine for searxng"""

from urllib.parse import urlencode
from datetime import datetime
import re
import json

from searx.utils import html_to_text, gen_useragent
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException

# Metadata
about = {
    "website": "https://m.quark.cn/",
    "wikidata_id": "Q48816502",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = []
paging = True
results_per_page = 10

quark_category = 'general'

time_range_support = True
time_range_dict = {'day': '4', 'week': '3', 'month': '2', 'year': '1'}

CAPTCHA_PATTERN = r'\{[^{]*?"action"\s*:\s*"captcha"\s*,\s*"url"\s*:\s*"([^"]+)"[^{]*?\}'


def is_alibaba_captcha(html):
    """
    Detects if the response contains an Alibaba X5SEC CAPTCHA page.

    Quark may return a CAPTCHA challenge after 9 requests in a short period.

    Typically, the ban duration is around 15 minutes.
    """
    return bool(re.search(CAPTCHA_PATTERN, html))


def init(_):
    if quark_category not in ('general', 'images'):
        raise SearxEngineAPIException(f"Unsupported category: {quark_category}")


def request(query, params):
    page_num = params["pageno"]

    category_config = {
        'general': {
            'endpoint': 'https://m.quark.cn/s',
            'params': {
                "q": query,
                "layout": "html",
                "page": page_num,
            },
        },
        'images': {
            'endpoint': 'https://vt.sm.cn/api/pic/list',
            'params': {
                "query": query,
                "limit": results_per_page,
                "start": (page_num - 1) * results_per_page,
            },
        },
    }

    query_params = category_config[quark_category]['params']
    query_url = category_config[quark_category]['endpoint']

    if time_range_dict.get(params['time_range']) and quark_category == 'general':
        query_params["tl_request"] = time_range_dict.get(params['time_range'])

    params["url"] = f"{query_url}?{urlencode(query_params)}"
    params["headers"] = {
        "User-Agent": gen_useragent(),
    }
    return params


def response(resp):
    results = []
    text = resp.text

    if is_alibaba_captcha(text):
        raise SearxEngineCaptchaException(
            suspended_time=900, message="Alibaba CAPTCHA detected. Please try again later."
        )

    if quark_category == 'images':
        data = json.loads(text)
        for item in data.get('data', {}).get('hit', {}).get('imgInfo', {}).get('item', []):
            try:
                published_date = datetime.fromtimestamp(int(item.get("publish_time")))
            except (ValueError, TypeError):
                published_date = None

            results.append(
                {
                    "template": "images.html",
                    "url": item.get("imgUrl"),
                    "thumbnail_src": item.get("img"),
                    "img_src": item.get("bigPicUrl"),
                    "title": item.get("title"),
                    "source": item.get("site"),
                    "resolution": f"{item['width']} x {item['height']}",
                    "publishedDate": published_date,
                }
            )

    if quark_category == 'general':
        # Quark returns a variety of different sc values on a single page, depending on the query type.
        source_category_parsers = {
            'addition': parse_addition,
            'ai_page': parse_ai_page,
            'baike_sc': parse_baike_sc,
            'finance_shuidi': parse_finance_shuidi,
            'kk_yidian_all': parse_kk_yidian_all,
            'life_show_general_image': parse_life_show_general_image,
            'med_struct': parse_med_struct,
            'music_new_song': parse_music_new_song,
            'nature_result': parse_nature_result,
            'news_uchq': parse_news_uchq,
            'ss_note': parse_ss_note,
            # ss_kv, ss_pic, ss_text, ss_video, baike, structure_web_novel use the same struct as ss_doc
            'ss_doc': parse_ss_doc,
            'ss_kv': parse_ss_doc,
            'ss_pic': parse_ss_doc,
            'ss_text': parse_ss_doc,
            'ss_video': parse_ss_doc,
            'baike': parse_ss_doc,
            'structure_web_novel': parse_ss_doc,
            'travel_dest_overview': parse_travel_dest_overview,
            'travel_ranking_list': parse_travel_ranking_list,
        }

        pattern = r'<script\s+type="application/json"\s+id="s-data-[^"]+"\s+data-used-by="hydrate">(.*?)</script>'
        matches = re.findall(pattern, text, re.DOTALL)

        for match in matches:
            data = json.loads(match)
            initial_data = data.get('data', {}).get('initialData', {})
            extra_data = data.get('extraData', {})

            source_category = extra_data.get('sc')

            parsers = source_category_parsers.get(source_category)
            if parsers:
                parsed_results = parsers(initial_data)
                if isinstance(parsed_results, list):
                    # Extend if the result is a list
                    results.extend(parsed_results)
                else:
                    # Append if it's a single result
                    results.append(parsed_results)

    return results


def parse_addition(data):
    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
    }


def parse_ai_page(data):
    results = []
    for item in data.get('list', []):
        content = (
            " | ".join(map(str, item.get('content', [])))
            if isinstance(item.get('content'), list)
            else str(item.get('content'))
        )

        try:
            published_date = datetime.fromtimestamp(int(item.get('source', {}).get('time')))
        except (ValueError, TypeError):
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(content),
                "publishedDate": published_date,
            }
        )
    return results


def parse_baike_sc(data):
    return {
        "title": html_to_text(data.get('data', {}).get('title')),
        "url": data.get('data', {}).get('url'),
        "content": html_to_text(data.get('data', {}).get('abstract')),
        "thumbnail": data.get('data', {}).get('img').replace("http://", "https://"),
    }


def parse_finance_shuidi(data):
    content = " | ".join(
        (
            info
            for info in [
                data.get('establish_time'),
                data.get('company_status'),
                data.get('controled_type'),
                data.get('company_type'),
                data.get('capital'),
                data.get('address'),
                data.get('business_scope'),
            ]
            if info
        )
    )
    return {
        "title": html_to_text(data.get('company_name')),
        "url": data.get('title_url'),
        "content": html_to_text(content),
    }


def parse_kk_yidian_all(data):
    content_list = []
    for section in data.get('list_container', []):
        for item in section.get('list_container', []):
            if 'dot_text' in item:
                content_list.append(item['dot_text'])

    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('title_url'),
        "content": html_to_text(' '.join(content_list)),
    }


def parse_life_show_general_image(data):
    results = []
    for item in data.get('image', []):
        try:
            published_date = datetime.fromtimestamp(int(item.get("publish_time")))
        except (ValueError, TypeError):
            published_date = None

        results.append(
            {
                "template": "images.html",
                "url": item.get("imgUrl"),
                "thumbnail_src": item.get("img"),
                "img_src": item.get("bigPicUrl"),
                "title": item.get("title"),
                "source": item.get("site"),
                "resolution": f"{item['width']} x {item['height']}",
                "publishedDate": published_date,
            }
        )
    return results


def parse_med_struct(data):
    return {
        "title": html_to_text(data.get('title')),
        "url": data.get('message', {}).get('statistics', {}).get('nu'),
        "content": html_to_text(data.get('message', {}).get('content_text')),
        "thumbnail": data.get('message', {}).get('video_img').replace("http://", "https://"),
    }


def parse_music_new_song(data):
    results = []
    for item in data.get('hit3', []):
        results.append(
            {
                "title": f"{item['song_name']} | {item['song_singer']}",
                "url": item.get("play_url"),
                "content": html_to_text(item.get("lyrics")),
                "thumbnail": item.get("image_url").replace("http://", "https://"),
            }
        )
    return results


def parse_nature_result(data):
    return {"title": html_to_text(data.get('title')), "url": data.get('url'), "content": html_to_text(data.get('desc'))}


def parse_news_uchq(data):
    results = []
    for item in data.get('feed', []):
        try:
            published_date = datetime.strptime(item.get('time'), "%Y-%m-%d")
        except (ValueError, TypeError):
            # Sometimes Quark returns a non-standard format like "1天前" ("1 day ago"); leave published_date as None
            published_date = None

        results.append(
            {
                "title": html_to_text(item.get('title')),
                "url": item.get('url'),
                "content": html_to_text(item.get('summary')),
                "thumbnail": item.get('image').replace("http://", "https://"),
                "publishedDate": published_date,
            }
        )
    return results


def parse_ss_doc(data):
    published_date = None
    try:
        timestamp = int(data.get('sourceProps', {}).get('time'))

        # Sometimes Quark returns 0; leave published_date as None
        if timestamp != 0:
            published_date = datetime.fromtimestamp(timestamp)
    except (ValueError, TypeError):
        pass

    try:
        thumbnail = data.get('picListProps', [])[0].get('src').replace("http://", "https://")
    except (ValueError, TypeError, IndexError):
        thumbnail = None

    return {
        "title": html_to_text(
            data.get('titleProps', {}).get('content')
            # ss_kv variant 1 & 2
            or data.get('title')
        ),
        "url": data.get('sourceProps', {}).get('dest_url')
        # ss_kv variant 1
        or data.get('normal_url')
        # ss_kv variant 2
        or data.get('url'),
        "content": html_to_text(
            data.get('summaryProps', {}).get('content')
            # ss_doc variant 1
            or data.get('message', {}).get('replyContent')
            # ss_kv variant 1
            or data.get('show_body')
            # ss_kv variant 2
            or data.get('desc')
        ),
        "publishedDate": published_date,
        "thumbnail": thumbnail,
    }


def parse_ss_note(data):
    try:
        published_date = datetime.fromtimestamp(int(data.get('source', {}).get('time')))
    except (ValueError, TypeError):
        published_date = None

    return {
        "title": html_to_text(data.get('title', {}).get('content')),
        "url": data.get('source', {}).get('dest_url'),
        "content": html_to_text(data.get('summary', {}).get('content')),
        "publishedDate": published_date,
    }


def parse_travel_dest_overview(data):
    return {
        "title": html_to_text(data.get('strong', {}).get('title')),
        "url": data.get('strong', {}).get('baike_url'),
        "content": html_to_text(data.get('strong', {}).get('baike_text')),
    }


def parse_travel_ranking_list(data):
    return {
        "title": html_to_text(data.get('title', {}).get('text')),
        "url": data.get('title', {}).get('url'),
        "content": html_to_text(data.get('title', {}).get('title_tag')),
    }
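For reference, a minimal, self-contained sketch of the extraction step that response() performs for the general category: it pulls the embedded hydrate JSON blocks out of the HTML and reads the sc field that selects a parser. The HTML string and the HYDRATE_PATTERN / sample_html names below are fabricated for illustration only; real Quark pages embed many such blocks with varying sc values.

import json
import re

# Same regex as the pattern built in response() above.
HYDRATE_PATTERN = r'<script\s+type="application/json"\s+id="s-data-[^"]+"\s+data-used-by="hydrate">(.*?)</script>'

# Fabricated stand-in for one hydrate block (illustrative only, not a captured response).
sample_html = (
    '<script type="application/json" id="s-data-1" data-used-by="hydrate">'
    '{"data": {"initialData": {"title": "Example", "url": "https://example.org/", "desc": "Example snippet"}},'
    ' "extraData": {"sc": "nature_result"}}'
    '</script>'
)

for blob in re.findall(HYDRATE_PATTERN, sample_html, re.DOTALL):
    data = json.loads(blob)
    sc = data.get('extraData', {}).get('sc')                    # e.g. "nature_result"
    initial_data = data.get('data', {}).get('initialData', {})  # payload handed to the matching parser
    print(sc, initial_data.get('title'))                        # prints: nature_result Example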
searx/settings.yml
@@ -1641,6 +1641,20 @@ engines:
    shortcut: pypi
    engine: pypi

  - name: quark
    quark_category: general
    categories: [general]
    engine: quark
    shortcut: qk
    disabled: true

  - name: quark images
    quark_category: images
    categories: [images]
    engine: quark
    shortcut: qki
    disabled: true

  - name: qwant
    qwant_categ: web
    engine: qwant
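Both entries ship with disabled: true; to try the engine, set disabled: false (or drop the line) in your settings.yml and query with the qk or qki bang shortcut. As a quick offline sanity check of one of the parsers, a sketch assuming a SearXNG development checkout where the searx package is importable; the input dict only mimics the fields parse_nature_result() reads and is not a captured Quark payload:

from searx.engines import quark

# Feed a minimal fabricated "nature_result" payload through the parser.
print(quark.parse_nature_result({
    "title": "Example title",
    "url": "https://example.org/",
    "desc": "Example snippet",
}))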