diff --git a/searx/engines/kagi.py b/searx/engines/kagi.py
new file mode 100644
index 000000000..84c0006f4
--- /dev/null
+++ b/searx/engines/kagi.py
@@ -0,0 +1,148 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Kagi Search
Scrapes Kagi's HTML search results.
"""

from urllib.parse import urlencode
from lxml import html

from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.exceptions import SearxEngineAPIException
from searx import logger

logger = logger.getChild('kagi')

about = {
    "website": 'https://kagi.com',
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": True,
    "results": 'HTML',
}

categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False

base_url = 'https://kagi.com/html/search'

api_key = None  # Set in settings.yml

# Global cookie storage for Kagi authentication
kagi_cookies = {'kagi_session': None, '_kagi_search_': None}


def request(query, params):
    if not api_key:
        raise SearxEngineAPIException('missing Kagi API key')

    page = params['pageno']

    if 'cookies' not in params:
        params['cookies'] = {}
    params['cookies'].update(kagi_cookies)

    if kagi_cookies['kagi_session'] and kagi_cookies['_kagi_search_']:
        logger.debug(
            "Using Kagi cookies for authentication - session: %s, search: %s",
            kagi_cookies['kagi_session'],
            kagi_cookies['_kagi_search_'],
        )
        search_url = base_url + '?' + urlencode({'q': query, 'batch': page})
    else:
        missing = []
        if not kagi_cookies['kagi_session']:
            missing.append('kagi_session')
        if not kagi_cookies['_kagi_search_']:
            missing.append('_kagi_search_')
        logger.debug("Missing cookies %s, using API key for initial authentication", missing)
        search_url = base_url + '?' + urlencode({'q': query, 'token': api_key, 'batch': page})
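
    # Authentication flow: the first request carries the API token in the query
    # string; response() then stores the kagi_session and _kagi_search_ cookies
    # returned by Kagi, and later requests rely on those cookies instead.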
    params['url'] = search_url
    params['headers'].update(
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
        }
    )
    params['allow_redirects'] = True
    params['verify'] = True
    params['max_redirects'] = 1

    return params


def response(resp):
    results = []

    if 'set-cookie' in resp.headers:
        cookies = resp.headers.get_list('set-cookie')
        for cookie in cookies:
            try:
                cookie_parts = cookie.split('=', 1)
                if len(cookie_parts) != 2:
                    continue

                name = cookie_parts[0].strip()
                value = cookie_parts[1].split(';')[0].strip()

                if name == 'kagi_session':
                    if value != kagi_cookies['kagi_session']:
                        kagi_cookies['kagi_session'] = value
                        resp.search_params['cookies']['kagi_session'] = value
                        logger.debug("Updated kagi_session cookie: %s", value)
                elif name == '_kagi_search_':  # Exact match for search cookie
                    if value != kagi_cookies['_kagi_search_']:
                        kagi_cookies['_kagi_search_'] = value
                        resp.search_params['cookies']['_kagi_search_'] = value
                        logger.debug("Updated _kagi_search_ cookie: %s", value)
            except ValueError as e:
                logger.warning("Failed to parse Kagi cookie: %s", str(e))

    logger.debug(
        "Global Kagi cookies - session: %s, search: %s", kagi_cookies['kagi_session'], kagi_cookies['_kagi_search_']
    )
    logger.debug(
        "Request Kagi cookies - session: %s, search: %s",
        resp.search_params['cookies'].get('kagi_session'),
        resp.search_params['cookies'].get('_kagi_search_'),
    )

    if resp.status_code == 401:
        kagi_cookies['kagi_session'] = None
        kagi_cookies['_kagi_search_'] = None
        resp.search_params['cookies'].clear()
        logger.debug("Cleared invalid Kagi cookies")

        raise SearxEngineAPIException('Invalid Kagi authentication')
    if resp.status_code == 429:
        raise SearxEngineAPIException('Kagi rate limit exceeded')
    if resp.status_code != 200:
        raise SearxEngineAPIException(f'Unexpected HTTP status code: {resp.status_code}')

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[contains(@class, "_0_SRI")]'):
        try:
            title_tag = eval_xpath(result, './/a[contains(@class, "__sri_title_link")]')[0]
            title = extract_text(title_tag)
            url = title_tag.get('href')
            content_tag = eval_xpath(result, './/div[contains(@class, "__sri-desc")]')
            content = extract_text(content_tag[0]) if content_tag else ''
            domain = eval_xpath(result, './/span[contains(@class, "host")]/text()')
            domain = domain[0] if domain else ''

            search_result = {'url': url, 'title': title, 'content': content, 'domain': domain}
            results.append(search_result)

        except (IndexError, KeyError):
            continue

    return results
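
A minimal smoke test of the token-authenticated first request, as a sketch for reviewers; the import path and token value are illustrative assumptions, not part of this patch.

# Hypothetical example: exercise request() with a placeholder token.
from searx.engines import kagi

kagi.api_key = 'example-token'  # placeholder; normally populated from settings.yml
params = kagi.request('searx', {'pageno': 1, 'headers': {}})

# With no cached cookies, the first request authenticates via the API token.
assert 'token=example-token' in params['url']
assert params['cookies'] == {'kagi_session': None, '_kagi_search_': None}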