diff --git a/searx/engines/demo_offline.py b/searx/engines/demo_offline.py
index 2cef4f0d0..6a3b8ddf7 100644
--- a/searx/engines/demo_offline.py
+++ b/searx/engines/demo_offline.py
@@ -15,6 +15,7 @@ close to the implementation, its just a simple example.  To get in use of this
 import json

 from searx.result_types import EngineResults
+from searx.enginelib import EngineCache

 engine_type = 'offline'
 categories = ['general']
@@ -32,14 +33,18 @@ about = {
 # if there is a need for globals, use a leading underline
 _my_offline_engine: str = ""

+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""

-def init(engine_settings=None):
+
+def init(engine_settings):
     """Initialization of the (offline) engine.  The origin of this demo engine is a
     simple json string which is loaded in this example while the engine is
-    initialized.
+    initialized."""
+    global _my_offline_engine, CACHE  # pylint: disable=global-statement

-    """
-    global _my_offline_engine  # pylint: disable=global-statement
+    CACHE = EngineCache(engine_settings["name"])  # type:ignore

     _my_offline_engine = (
         '[ {"value": "%s"}'
@@ -57,8 +62,8 @@ def search(query, request_params) -> EngineResults:
     results.
     """
     res = EngineResults()
+    count = CACHE.get("count", 0)

-    count = 0
     for row in json.loads(_my_offline_engine):
         count += 1
         kvmap = {
@@ -75,4 +80,7 @@ def search(query, request_params) -> EngineResults:
         )
     )
     res.add(res.types.LegacyResult(number_of_results=count))
+
+    # cache the counter value for 20 seconds
+    CACHE.set("count", count, expire=20)
     return res
diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 450cd9cf8..62e1603a6 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -6,16 +6,17 @@ DuckDuckGo WEB

 from __future__ import annotations

-from typing import TYPE_CHECKING
-import re
-from urllib.parse import quote_plus
 import json
+import re
+import typing
+
+from urllib.parse import quote_plus
+
 import babel
 import lxml.html

 from searx import (
     locales,
-    redislib,
     external_bang,
 )
 from searx.utils import (
@@ -25,12 +26,12 @@ from searx.utils import (
     extract_text,
 )
 from searx.network import get  # see https://github.com/searxng/searxng/issues/762
-from searx import redisdb
 from searx.enginelib.traits import EngineTraits
+from searx.enginelib import EngineCache
 from searx.exceptions import SearxEngineCaptchaException
 from searx.result_types import EngineResults

-if TYPE_CHECKING:
+if typing.TYPE_CHECKING:
     import logging

     logger: logging.Logger
@@ -61,28 +62,18 @@ url = "https://html.duckduckgo.com/html"
 time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
 form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}

-__CACHE = []
+
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""

-def _cache_key(query: str, region: str):
-    return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
+def init(_):
+    global CACHE  # pylint: disable=global-statement
+    CACHE = EngineCache("duckduckgo")  # type:ignore

-def cache_vqd(query: str, region: str, value: str):
-    """Caches a ``vqd`` value from a query."""
-    c = redisdb.client()
-    if c:
-        logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
-        c.set(_cache_key(query, region), value, ex=600)
-
-    else:
-        logger.debug("MEM cache vqd value: %s (%s)", value, region)
-        if len(__CACHE) > 100:  # cache vqd from last 100 queries
-            __CACHE.pop(0)
-        __CACHE.append((_cache_key(query, region), value))
-
-
-def get_vqd(query: str, region: str, force_request: bool = False):
+def get_vqd(query: str, region: str, force_request: bool = False) -> str:
     """Returns the ``vqd`` that fits to the *query*.

     :param query: The query term
@@ -114,31 +105,34 @@ def get_vqd(query: str, region: str, force_request: bool = False):
     seems the block list is a sliding window: to get my IP rid from the bot
     list I had to cool down my IP for 1h (send no requests from that IP to
     DDG).
     """
-    key = _cache_key(query, region)
+    key = CACHE.secret_hash(f"{query}//{region}")
+    value = CACHE.get(key=key)
+    if value is not None and not force_request:
+        logger.debug("vqd: re-use cached value: %s", value)
+        return value

-    c = redisdb.client()
-    if c:
-        value = c.get(key)
-        if value or value == b'':
-            value = value.decode('utf-8')  # type: ignore
-            logger.debug("re-use CACHED vqd value: %s", value)
-            return value
+    logger.debug("vqd: request value from duckduckgo.com")
+    resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
+    if resp.status_code == 200:  # type: ignore
+        value = extr(resp.text, 'vqd="', '"')  # type: ignore
+        if value:
+            logger.debug("vqd value from duckduckgo.com request: '%s'", value)
+        else:
+            logger.error("vqd: can't parse value from ddg response (return empty string)")
+            return ""
+    else:
+        logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)

-    for k, value in __CACHE:
-        if k == key:
-            logger.debug("MEM re-use CACHED vqd value: %s", value)
-            return value
+    if value:
+        set_vqd(query, region, value)
+    return value or ""

-    if force_request:
-        resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
-        if resp.status_code == 200:  # type: ignore
-            value = extr(resp.text, 'vqd="', '"')  # type: ignore
-            if value:
-                logger.debug("vqd value from DDG request: %s", value)
-                cache_vqd(query, region, value)
-                return value
-    return None
+
+def set_vqd(query: str, region: str, value: str):
+    """Caches a ``vqd`` value for *query* and *region* (expires after 1h)."""
+    key = CACHE.secret_hash(f"{query}//{region}")
+    CACHE.set(key=key, value=value, expire=3600)


 def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
@@ -373,8 +367,11 @@ def response(resp) -> EngineResults:
         # some locales (at least China) does not have a "next page" button
         form = form[0]
         form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
-
-        cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
+        set_vqd(
+            query=resp.search_params['data']['q'],
+            region=resp.search_params['data']['kl'],
+            value=str(form_vqd),
+        )

     # just select "web-result" and ignore results of class "result--ad result--ad--small"
     for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
@@ -401,7 +398,7 @@ def response(resp) -> EngineResults:
         results.add(
             results.types.Answer(
                 answer=zero_click,
-                url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0),
+                url=eval_xpath_getindex(doc, '//div[@id="zero_click_abstract"]/a/@href', 0),  # type: ignore
             )
         )
diff --git a/searx/engines/radio_browser.py b/searx/engines/radio_browser.py
index 64208304e..70aecd476 100644
--- a/searx/engines/radio_browser.py
+++ b/searx/engines/radio_browser.py
@@ -5,7 +5,9 @@ https://de1.api.radio-browser.info/#Advanced_station_search

 """
+from __future__ import annotations

+import typing
 import random
 import socket
 from urllib.parse import urlencode
@@ -13,9 +15,15 @@ import babel
 from flask_babel import gettext

 from searx.network import get
+from searx.enginelib import EngineCache
 from searx.enginelib.traits import EngineTraits
 from searx.locales import language_tag

+if typing.TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger
+
 traits: EngineTraits

 about = {
@@ -52,11 +60,24 @@ none filters are applied.  Valid filters are:

 """

-servers = []
+CACHE: EngineCache
+"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
+seconds."""


 def init(_):
-    # see https://api.radio-browser.info
+    global CACHE  # pylint: disable=global-statement
+    CACHE = EngineCache("radio_browser")
+    server_list()
+
+
+def server_list() -> list[str]:
+
+    servers = CACHE.get("servers", [])
+    if servers:
+        return servers
+
+    # hint: resolving the server list can take up to 40 seconds!
    ips = socket.getaddrinfo("all.api.radio-browser.info", 80, 0, 0, socket.IPPROTO_TCP)
     for ip_tuple in ips:
         _ip: str = ip_tuple[4][0]  # type: ignore
@@ -65,8 +86,22 @@ def init(_):
         if srv not in servers:
             servers.append(srv)

+    # update the server list once per 24h
+    CACHE.set(key="servers", value=servers, expire=60 * 60 * 24)
+
+    return servers
+

 def request(query, params):
+
+    servers = server_list()
+    if not servers:
+        logger.error("Fetched server list is empty!")
+        params["url"] = None
+        return
+
+    server = random.choice(servers)
+
     args = {
         'name': query,
         'order': 'votes',
@@ -87,8 +122,7 @@ def request(query, params):
         if countrycode in traits.custom['countrycodes']:  # type: ignore
             args['countrycode'] = countrycode

-    params['url'] = f"{random.choice(servers)}/json/stations/search?{urlencode(args)}"
-    return params
+    params['url'] = f"{server}/json/stations/search?{urlencode(args)}"


 def response(resp):
@@ -154,8 +188,9 @@ def fetch_traits(engine_traits: EngineTraits):

     babel_reg_list = get_global("territory_languages").keys()

-    language_list = get(f'{servers[0]}/json/languages').json()  # type: ignore
-    country_list = get(f'{servers[0]}/json/countries').json()  # type: ignore
+    server = server_list()[0]
+    language_list = get(f'{server}/json/languages').json()  # type: ignore
+    country_list = get(f'{server}/json/countries').json()  # type: ignore

     for lang in language_list:
diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py
index 23032223e..08df9aa04 100644
--- a/searx/engines/soundcloud.py
+++ b/searx/engines/soundcloud.py
@@ -1,14 +1,23 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """SoundCloud is a German audio streaming service."""
+from __future__ import annotations

 import re
-from urllib.parse import quote_plus, urlencode
+import typing
 import datetime

+from urllib.parse import quote_plus, urlencode
+
 from dateutil import parser
 from lxml import html

 from searx.network import get as http_get
+from searx.enginelib import EngineCache
+
+if typing.TYPE_CHECKING:
+    import logging
+
+    logger: logging.Logger

 about = {
     "website": "https://soundcloud.com",
@@ -28,7 +37,6 @@ HTML frontend of the common WEB site.
""" cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U) -guest_client_id = "" results_per_page = 10 soundcloud_facet = "model" @@ -48,6 +56,10 @@ app_locale_map = { "sv": "sv", } +CACHE: EngineCache +"""Persistent (SQLite) key/value cache that deletes its values after ``expire`` +seconds.""" + def request(query, params): @@ -55,6 +67,12 @@ def request(query, params): # - user_id=451561-497874-703312-310156 # - app_version=1740727428 + guest_client_id = CACHE.get("guest_client_id") + if guest_client_id is None: + guest_client_id = get_client_id() + if guest_client_id: + CACHE.set(key="guest_client_id", value=guest_client_id) + args = { "q": query, "offset": (params['pageno'] - 1) * results_per_page, @@ -104,12 +122,12 @@ def response(resp): return results -def init(engine_settings=None): # pylint: disable=unused-argument - global guest_client_id # pylint: disable=global-statement - guest_client_id = get_client_id() +def init(engine_settings): # pylint: disable=unused-argument + global CACHE # pylint: disable=global-statement + CACHE = EngineCache(engine_settings["name"]) # type:ignore -def get_client_id() -> str: +def get_client_id() -> str | None: client_id = "" url = "https://soundcloud.com" @@ -143,4 +161,4 @@ def get_client_id() -> str: logger.info("using client_id '%s' for soundclud queries", client_id) else: logger.warning("missing valid client_id for soundclud queries") - return client_id + return client_id or None diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 54e05604b..6c77e37c8 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -84,7 +84,6 @@ from typing import TYPE_CHECKING, Any from collections import OrderedDict import re from unicodedata import normalize, combining -from time import time from datetime import datetime, timedelta from json import loads @@ -97,6 +96,7 @@ from searx.network import get # see https://github.com/searxng/searxng/issues/7 from searx.exceptions import SearxEngineCaptchaException from searx.locales import region_tag from searx.enginelib.traits import EngineTraits +from searx.enginelib import EngineCache if TYPE_CHECKING: import logging @@ -159,10 +159,21 @@ search_form_xpath = '//form[@id="search"]' """ -# timestamp of the last fetch of 'sc' code -sc_code_ts = 0 -sc_code = '' -sc_code_cache_sec = 30 + +CACHE: EngineCache +"""Persistent (SQLite) key/value cache that deletes its values after ``expire`` +seconds.""" + + +def init(_): + global CACHE # pylint: disable=global-statement + + # hint: all three startpage engines (WEB, Images & News) can/should use the + # same sc_code .. + CACHE = EngineCache("startpage") # type:ignore + + +sc_code_cache_sec = 3600 """Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" @@ -176,14 +187,10 @@ def get_sc_code(searxng_locale, params): Startpage's search form generates a new sc-code on each request. This function scrap a new sc-code from Startpage's home page every - :py:obj:`sc_code_cache_sec` seconds. + :py:obj:`sc_code_cache_sec` seconds.""" - """ - - global sc_code_ts, sc_code # pylint: disable=global-statement - - if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)): - logger.debug("get_sc_code: reuse '%s'", sc_code) + sc_code = CACHE.get("SC_CODE", "") + if sc_code: return sc_code headers = {**params['headers']} @@ -233,8 +240,9 @@ def get_sc_code(searxng_locale, params): message="get_sc_code: [PR-695] query new sc time-stamp failed! 
(%s)" % resp.url, # type: ignore ) from exc - sc_code_ts = time() + sc_code = str(sc_code) logger.debug("get_sc_code: new value is: %s", sc_code) + CACHE.set(key="SC_CODE", value=sc_code, expire=sc_code_cache_sec) return sc_code diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 5670e356f..60892d4d8 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -5,7 +5,7 @@ from urllib.parse import urlencode -from lxml import etree +import lxml.etree # about about = { @@ -72,7 +72,7 @@ def replace_pua_chars(text): def response(resp): results = [] - search_results = etree.XML(resp.content) + search_results = lxml.etree.XML(resp.content) # return empty array if there are no results if search_results.xpath(failure_xpath): diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index a9d177c32..5ac261d12 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -3,11 +3,13 @@ Wolfram|Alpha (Science) """ +from __future__ import annotations + from json import loads -from time import time from urllib.parse import urlencode from searx.network import get as http_get +from searx.enginelib import EngineCache # about about = { @@ -40,41 +42,39 @@ search_url = ( referer_url = url + 'input/?{query}' -token = {'value': '', 'last_updated': None} - # pods to display as image in infobox # this pods do return a plaintext, but they look better and are more useful as images image_pods = {'VisualRepresentation', 'Illustration', 'Symbol'} -# seems, wolframalpha resets its token in every hour -def obtain_token(): - update_time = time() - (time() % 3600) - try: - token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) - token['value'] = loads(token_response.text)['code'] - token['last_updated'] = update_time - except: # pylint: disable=bare-except - pass +CACHE: EngineCache +"""Persistent (SQLite) key/value cache that deletes its values after ``expire`` +seconds.""" + + +def init(engine_settings): + global CACHE # pylint: disable=global-statement + CACHE = EngineCache(engine_settings["name"]) # type:ignore + + +def obtain_token() -> str: + token = CACHE.get(key="token") + if token is None: + resp = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) + token = resp.json()["code"] + # seems, wolframalpha resets its token in every hour + CACHE.set(key="code", value=token, expire=3600) return token -def init(engine_settings=None): # pylint: disable=unused-argument - obtain_token() - - -# do search-request def request(query, params): - # obtain token if last update was more than an hour - if time() - (token['last_updated'] or 0) > 3600: - obtain_token() - params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) + token = obtain_token() + params['url'] = search_url.format(query=urlencode({'input': query}), token=token) params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query})) return params -# get response from search-request def response(resp): results = []