Merge 6697cb695012b28c99690d64716d8b774375c486 into cc9dbde2e58ffc4b2ddd05ccf4327ebe8e168514
This commit is contained in:
commit
c3d4c52108
@ -175,3 +175,4 @@ features or generally made searx better:
|
|||||||
- Daniel Kukula `<https://github.com/dkuku>`
|
- Daniel Kukula `<https://github.com/dkuku>`
|
||||||
- Patrick Evans `https://github.com/holysoles`
|
- Patrick Evans `https://github.com/holysoles`
|
||||||
- Daniel Mowitz `<https://daniel.mowitz.rocks>`
|
- Daniel Mowitz `<https://daniel.mowitz.rocks>`
|
||||||
|
- SentientTapeDrive `<https://github.com/SentientTapeDrive>`_ `<https://thefubar.company>`_
|
||||||
|
105
docs/dev/engines/online/kagi.rst
Normal file
105
docs/dev/engines/online/kagi.rst
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
.. _kagi engine:
|
||||||
|
|
||||||
|
Kagi
|
||||||
|
====
|
||||||
|
|
||||||
|
The Kagi engine scrapes search results from Kagi's HTML search interface.
|
||||||
|
|
||||||
|
Example
|
||||||
|
-------
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
- name: kagi
|
||||||
|
engine: kagi
|
||||||
|
shortcut: kg
|
||||||
|
categories: [general, web]
|
||||||
|
timeout: 4.0
|
||||||
|
api_key: "YOUR-KAGI-TOKEN" # required
|
||||||
|
about:
|
||||||
|
website: https://kagi.com
|
||||||
|
use_official_api: false
|
||||||
|
require_api_key: true
|
||||||
|
results: HTML
|
||||||
|
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
~~~~~~~~~~
|
||||||
|
|
||||||
|
``api_key`` : required
|
||||||
|
The Kagi API token used for authentication. Can be obtained from your Kagi account settings.
|
||||||
|
|
||||||
|
``pageno`` : optional
|
||||||
|
The page number for paginated results. Defaults to 1.
|
||||||
|
|
||||||
|
Example Request
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
params = {
|
||||||
|
'api_key': 'YOUR-KAGI-TOKEN',
|
||||||
|
'pageno': 1,
|
||||||
|
'headers': {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.5',
|
||||||
|
'DNT': '1'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
query = 'test query'
|
||||||
|
request_params = kagi.request(query, params)
|
||||||
|
|
||||||
|
Example Response
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
[
|
||||||
|
# Search result
|
||||||
|
{
|
||||||
|
'url': 'https://example.com/',
|
||||||
|
'title': 'Example Title',
|
||||||
|
'content': 'Example content snippet...',
|
||||||
|
'domain': 'example.com'
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
Implementation
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The engine performs the following steps:
|
||||||
|
|
||||||
|
1. Constructs a GET request to ``https://kagi.com/html/search`` with:
|
||||||
|
- ``q`` parameter for the search query
|
||||||
|
- ``token`` parameter for authentication
|
||||||
|
- ``batch`` parameter for pagination
|
||||||
|
|
||||||
|
2. Parses the HTML response using XPath to extract:
|
||||||
|
- Result titles
|
||||||
|
- URLs
|
||||||
|
- Content snippets
|
||||||
|
- Domain information
|
||||||
|
|
||||||
|
3. Handles various error cases:
|
||||||
|
- 401: Invalid API token
|
||||||
|
- 429: Rate limit exceeded
|
||||||
|
- Other non-200 status codes
|
||||||
|
|
||||||
|
Dependencies
|
||||||
|
------------
|
||||||
|
|
||||||
|
- lxml: For HTML parsing and XPath evaluation
|
||||||
|
- urllib.parse: For URL handling and encoding
|
||||||
|
- searx.utils: For text extraction and XPath helpers
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
|
||||||
|
- The engine requires a valid Kagi API token to function
|
||||||
|
- Results are scraped from Kagi's HTML interface rather than using an official API
|
||||||
|
- Rate limiting may apply based on your Kagi subscription level
|
||||||
|
- The engine sets specific browser-like headers to ensure reliable scraping
|
148
searx/engines/kagi.py
Normal file
148
searx/engines/kagi.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""Kagi Search
|
||||||
|
Scrapes Kagi's HTML search results.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
|
from searx.utils import extract_text, eval_xpath, eval_xpath_list
|
||||||
|
from searx.exceptions import SearxEngineAPIException
|
||||||
|
from searx import logger
|
||||||
|
|
||||||
|
# Child logger so this engine's messages are tagged with 'kagi'.
logger = logger.getChild('kagi')

# Engine metadata: results are scraped from the HTML interface (no official
# API is used) and an API key is required.
about = dict(
    website='https://kagi.com',
    wikidata_id=None,
    official_api_documentation=None,
    use_official_api=False,
    require_api_key=True,
    results='HTML',
)

# Engine capabilities.
categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False

# Endpoint of Kagi's HTML search interface.
base_url = 'https://kagi.com/html/search'

# Kagi token; set in settings.yml.
api_key = None

# Global cookie storage for Kagi authentication.  ``response`` fills it from
# ``Set-Cookie`` headers and ``request`` reads it on subsequent searches.
kagi_cookies = {'kagi_session': None, '_kagi_search_': None}
|
||||||
|
|
||||||
|
|
||||||
|
def request(query, params):
    """Build the HTTP request for a Kagi HTML search.

    Authentication uses the module-level ``kagi_cookies`` cache: when both
    ``kagi_session`` and ``_kagi_search_`` are known the request relies on
    those cookies alone, otherwise the configured ``api_key`` is sent as the
    ``token`` query argument.

    :param query: the search terms.
    :param params: request parameters; ``pageno`` and ``headers`` are read,
        ``cookies`` is created when absent.
    :returns: ``params`` with ``url``, ``headers``, ``cookies`` and the
        redirect / TLS options filled in.
    :raises SearxEngineAPIException: when no ``api_key`` is configured.
    """
    if not api_key:
        raise SearxEngineAPIException('missing Kagi API key')

    page = params['pageno']

    # Only forward cookies that actually hold a value; copying the ``None``
    # placeholders into the jar would send meaningless cookie entries.
    known_cookies = {name: value for name, value in kagi_cookies.items() if value}
    params.setdefault('cookies', {}).update(known_cookies)

    missing = [name for name in kagi_cookies if name not in known_cookies]
    if missing:
        logger.debug("Missing cookies %s, using API key for initial authentication", missing)
        query_args = {'q': query, 'token': api_key, 'batch': page}
    else:
        # Cookie values are credentials, so they are deliberately not logged.
        logger.debug("Using Kagi cookies for authentication")
        query_args = {'q': query, 'batch': page}

    params['url'] = base_url + '?' + urlencode(query_args)

    # Browser-like headers for the HTML endpoint.
    params['headers'].update(
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
        }
    )
    params['allow_redirects'] = True
    params['verify'] = True
    params['max_redirects'] = 1

    return params
|
||||||
|
|
||||||
|
|
||||||
|
def _store_auth_cookies(resp):
    """Cache Kagi's authentication cookies found in the ``Set-Cookie`` headers.

    Cookies named in ``kagi_cookies`` are copied into that module-level cache
    and into ``resp.search_params['cookies']``; all other cookies are ignored.
    """
    if 'set-cookie' not in resp.headers:
        return

    for raw_cookie in resp.headers.get_list('set-cookie'):
        name, separator, remainder = raw_cookie.partition('=')
        if not separator:
            # Not a ``name=value`` pair.
            continue

        name = name.strip()
        if name not in kagi_cookies:
            continue

        # Drop the attributes (Path, HttpOnly, ...) that follow the value.
        value = remainder.split(';', 1)[0].strip()
        if value != kagi_cookies[name]:
            kagi_cookies[name] = value
            # The value is a credential: log the event, never the value.
            logger.debug("Updated %s cookie", name)
        resp.search_params['cookies'][name] = value


def response(resp):
    """Parse a Kagi HTML result page into a list of result dicts.

    :param resp: the HTTP response; ``headers``, ``status_code``, ``text`` and
        ``search_params['cookies']`` are used.
    :returns: a list of dicts with the keys ``url``, ``title``, ``content``
        and ``domain`` (an empty string when the page shows no host).
    :raises SearxEngineAPIException: on HTTP 401 (the cached cookies are
        cleared first), HTTP 429, or any other non-200 status.
    """
    results = []

    _store_auth_cookies(resp)

    if resp.status_code == 401:
        # Forget the rejected cookies so the next request falls back to the token.
        for name in kagi_cookies:
            kagi_cookies[name] = None
        resp.search_params['cookies'].clear()
        logger.debug("Cleared invalid Kagi cookies")
        raise SearxEngineAPIException('Invalid Kagi authentication')
    if resp.status_code == 429:
        raise SearxEngineAPIException('Kagi rate limit exceeded')
    if resp.status_code != 200:
        raise SearxEngineAPIException(f'Unexpected HTTP status code: {resp.status_code}')

    if not resp.text.strip():
        # lxml refuses to parse an empty document.
        return results

    dom = html.fromstring(resp.text)

    for item in eval_xpath_list(dom, '//div[contains(@class, "_0_SRI")]'):
        title_links = eval_xpath(item, './/a[contains(@class, "__sri_title_link")]')
        if not title_links:
            continue

        url = title_links[0].get('href')
        if not url:
            # A result without a target link is useless downstream.
            continue

        descriptions = eval_xpath(item, './/div[contains(@class, "__sri-desc")]')
        hosts = eval_xpath(item, './/span[contains(@class, "host")]/text()')

        results.append(
            {
                'url': url,
                'title': extract_text(title_links[0]),
                'content': extract_text(descriptions[0]) if descriptions else '',
                'domain': hosts[0] if hosts else '',
            }
        )

    return results
|
@ -2484,6 +2484,19 @@ engines:
|
|||||||
shortcut: pgo
|
shortcut: pgo
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
- name: kagi
|
||||||
|
engine: kagi
|
||||||
|
shortcut: kg
|
||||||
|
categories: [general, web]
|
||||||
|
disabled: true
|
||||||
|
timeout: 4.0
|
||||||
|
api_key: ""
|
||||||
|
about:
|
||||||
|
website: https://kagi.com
|
||||||
|
use_official_api: false
|
||||||
|
require_api_key: true
|
||||||
|
results: HTML
|
||||||
|
|
||||||
# Doku engine lets you access any Doku wiki instance:
|
# Doku engine lets you access any Doku wiki instance:
|
||||||
# A public one or a private/corporate one.
|
# A public one or a private/corporate one.
|
||||||
# - name: ubuntuwiki
|
# - name: ubuntuwiki
|
||||||
|
152
tests/unit/test_engine_kagi.py
Normal file
152
tests/unit/test_engine_kagi.py
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# pylint: disable=missing-module-docstring
|
||||||
|
|
||||||
|
import mock
|
||||||
|
from lxml import html
|
||||||
|
from urllib.parse import parse_qs
|
||||||
|
|
||||||
|
from searx.engines import kagi
|
||||||
|
from searx.exceptions import SearxEngineAPIException
|
||||||
|
from tests import SearxTestCase
|
||||||
|
|
||||||
|
|
||||||
|
class TestKagiEngine(SearxTestCase):
    """Unit tests for the Kagi engine's request builder and response parser."""

    @staticmethod
    def _reset_engine_state():
        """Reset the engine's module-level authentication state."""
        kagi.api_key = None
        kagi.kagi_cookies['kagi_session'] = None
        kagi.kagi_cookies['_kagi_search_'] = None

    def setUp(self):
        # The engine keeps its key and cookies in module globals; start every
        # test from a clean slate and restore it afterwards so tests cannot
        # influence each other.
        self._reset_engine_state()
        self.addCleanup(self._reset_engine_state)

        self.test_html = """
        <div class="_0_main-search-results">
            <div class="_0_SRI search-result">
                <div class="_0_TITLE __sri-title">
                    <h3 class="__sri-title-box">
                        <a class="__sri_title_link _ext_t" href="https://example1.com">Result 1</a>
                    </h3>
                </div>
                <div class="__sri-url-box">
                    <span class="host">example1.com</span>
                </div>
                <div class="__sri-body">
                    <div class="__sri-desc">Content 1</div>
                </div>
            </div>
            <div class="_0_SRI search-result">
                <div class="_0_TITLE __sri-title">
                    <h3 class="__sri-title-box">
                        <a class="__sri_title_link _ext_t" href="https://example2.com">Result 2</a>
                    </h3>
                </div>
                <div class="__sri-url-box">
                    <span class="host">example2.com</span>
                </div>
                <div class="__sri-body">
                    <div class="__sri-desc">Content 2</div>
                </div>
            </div>
        </div>
        """

    def test_request(self):
        query = 'test query'

        # Test with missing API key
        kagi.api_key = None
        params = {'pageno': 1, 'headers': {}}
        self.assertRaises(SearxEngineAPIException, kagi.request, query, params)

        # Test with valid API key but no cookie
        kagi.api_key = 'test_token'
        params = {'pageno': 1, 'headers': {}, 'cookies': {}}
        request_params = kagi.request(query, params)

        self.assertIn('url', request_params)
        self.assertIn('token=test_token', request_params['url'])
        self.assertIn('q=test+query', request_params['url'])
        self.assertEqual(request_params['max_redirects'], 1)
        self.assertTrue(request_params['allow_redirects'])

        # Test with both required cookies cached by the engine
        kagi.kagi_cookies['kagi_session'] = 'test_session'
        kagi.kagi_cookies['_kagi_search_'] = 'test_search'
        params = {'pageno': 1, 'headers': {}, 'cookies': {}}
        request_params = kagi.request(query, params)
        self.assertNotIn('token=', request_params['url'])
        self.assertIn('q=test+query', request_params['url'])
        self.assertEqual(request_params['cookies']['kagi_session'], 'test_session')
        self.assertEqual(request_params['cookies']['_kagi_search_'], 'test_search')
        self.assertEqual(request_params['max_redirects'], 1)
        self.assertTrue(request_params['allow_redirects'])

        # Test with missing search cookie
        kagi.kagi_cookies['kagi_session'] = 'test_session'
        kagi.kagi_cookies['_kagi_search_'] = None
        params = {'pageno': 1, 'headers': {}, 'cookies': {}}
        request_params = kagi.request(query, params)
        self.assertIn('token=', request_params['url'])

        # Test with missing session cookie
        kagi.kagi_cookies['kagi_session'] = None
        kagi.kagi_cookies['_kagi_search_'] = 'test_search'
        params = {'pageno': 1, 'headers': {}, 'cookies': {}}
        request_params = kagi.request(query, params)
        self.assertIn('token=', request_params['url'])

        # Test pagination
        params['pageno'] = 2
        request_params = kagi.request(query, params)
        self.assertIn('batch=2', request_params['url'])
        self.assertEqual(request_params['max_redirects'], 1)

    def test_response(self):
        def verify_cookie_capture(cookie_headers, expected_session, expected_search):
            mock_headers = mock.Mock()
            mock_headers.get_list = mock.Mock(return_value=cookie_headers)
            mock_headers.__contains__ = mock.Mock(return_value=True)

            response = mock.Mock(
                text=self.test_html, status_code=200, headers=mock_headers, search_params={'cookies': {}}
            )
            results = kagi.response(response)

            self.assertEqual(response.search_params['cookies'].get('kagi_session'), expected_session)
            self.assertEqual(response.search_params['cookies'].get('_kagi_search_'), expected_search)
            # The engine-wide cache must hold the captured values as well.
            self.assertEqual(kagi.kagi_cookies['kagi_session'], expected_session)
            self.assertEqual(kagi.kagi_cookies['_kagi_search_'], expected_search)
            return results

        # Test cookie capture with standard attributes
        results = verify_cookie_capture(
            ['kagi_session=test_session; Path=/; HttpOnly', '_kagi_search_=test_search; Path=/; HttpOnly'],
            'test_session',
            'test_search',
        )

        # Test cookie capture with additional attributes
        results = verify_cookie_capture(
            [
                'kagi_session=test_session2; Path=/; HttpOnly; SameSite=Lax',
                '_kagi_search_=test_search2; Domain=.kagi.com; Path=/; SameSite=Lax',
            ],
            'test_session2',
            'test_search2',
        )

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 2)  # 2 search results

        # Check first result
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://example1.com')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[0]['domain'], 'example1.com')

        # Check second result
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://example2.com')
        self.assertEqual(results[1]['content'], 'Content 2')
        self.assertEqual(results[1]['domain'], 'example2.com')

    def test_response_error_handling(self):
        # Test invalid token/cookie response
        kagi.kagi_cookies['kagi_session'] = 'invalid_session'
        response = mock.Mock(
            text='', status_code=401, search_params={'cookies': {'kagi_session': 'invalid_session'}}, headers={}
        )
        self.assertRaises(SearxEngineAPIException, kagi.response, response)
        # Verify invalid cookie was cleared, both per request and engine-wide
        self.assertNotIn('kagi_session', response.search_params['cookies'])
        self.assertIsNone(kagi.kagi_cookies['kagi_session'])

        # Test rate limit response
        response = mock.Mock(text='', status_code=429, search_params={'cookies': {}}, headers={})
        self.assertRaises(SearxEngineAPIException, kagi.response, response)

        # Test other error response
        response = mock.Mock(text='', status_code=500, search_params={'cookies': {}}, headers={})
        self.assertRaises(SearxEngineAPIException, kagi.response, response)
|
Loading…
x
Reference in New Issue
Block a user