searxng/searx/engines/soundcloud.py
Markus Heiser bdfe1c2a15 [mod] engines: migration of the individual cache solutions to EngineCache
The EngineCache class replaces all previously individual solutions for caches in
the context of the engines.

- demo_offline.py
- duckduckgo.py
- radio_browser.py
- soundcloud.py
- startpage.py
- wolframalpha_api.py
- wolframalpha_noapi.py

Search term to test most of the modified engines::

    !ddg !rb !sc !sp !wa test

    !ddg !rb !sc !sp !wa foo

For introspection of the DB, jump into the developer environment and run the
command that shows the cache state::

    $ ./manage pyenv.cmd bash --norc --noprofile
    (py3) python -m searx.enginelib cache state

    cache tables and key/values
    ===========================
    [demo_offline        ] 2025-04-22 11:32:50 count        --> (int) 4
    [startpage           ] 2025-04-22 12:32:30 SC_CODE      --> (str) fSOBnhEMlDfE20
    [duckduckgo          ] 2025-04-22 12:32:31 4dff493e.... --> (str) 4-128634958369380006627592672385352473325
    [duckduckgo          ] 2025-04-22 12:40:06 3e2583e2.... --> (str) 4-263126175288871260472289814259666848451
    [radio_browser       ] 2025-04-23 11:33:08 servers      --> (list) ['https://de2.api.radio-browser.info',  ...]
    [soundcloud          ] 2025-04-29 11:40:06 guest_client_id --> (str) EjkRJG0BLNEZquRiPZYdNtJdyGtTuHdp
    [wolframalpha        ] 2025-04-22 12:40:06 code         --> (str) 5aa79f86205ad26188e0e26e28fb7ae7
    number of tables: 6
    number of key/value pairs: 7

In the "cache tables and key/values" section, the table name (engine name) is
in the first position, the calculated expiry date is in the second, and the
key/value pair is shown in the third and fourth positions.

About duckduckgo: The *vqd code* of ddg depends on the query term; therefore
the key is a hash value of the query term (so that the raw query term is not
stored).

In the "properties of ENGINES_CACHE" section all properties of the SQLiteAppl /
ExpireCache and their last modification date are shown::

    properties of ENGINES_CACHE
    ===========================
    [last modified: 2025-04-22 11:32:27] DB_SCHEMA           : 1
    [last modified: 2025-04-22 11:32:27] LAST_MAINTENANCE    :
    [last modified: 2025-04-22 11:32:27] crypt_hash          : ca612e3566fdfd7cf7efe2b1c9349f461158d07cb78a3750e5c5be686aa8ebdc
    [last modified: 2025-04-22 11:32:30] CACHE-TABLE--demo_offline: demo_offline
    [last modified: 2025-04-22 11:32:30] CACHE-TABLE--startpage: startpage
    [last modified: 2025-04-22 11:32:31] CACHE-TABLE--duckduckgo: duckduckgo
    [last modified: 2025-04-22 11:33:08] CACHE-TABLE--radio_browser: radio_browser
    [last modified: 2025-04-22 11:40:06] CACHE-TABLE--soundcloud: soundcloud
    [last modified: 2025-04-22 11:40:06] CACHE-TABLE--wolframalpha: wolframalpha

These properties provide information about the state of the ExpireCache and
control the behavior.  For example, the maintenance intervals are controlled by
the last modification date of the LAST_MAINTENANCE property and the hash value
of the password can be used to detect whether the password has been changed (in
this case the DB entries can no longer be decrypted and the entire cache must be
discarded).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2025-05-03 08:39:12 +02:00

165 lines
4.5 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""SoundCloud is a German audio streaming service."""
from __future__ import annotations
import re
import typing
import datetime
from urllib.parse import quote_plus, urlencode
from dateutil import parser
from lxml import html
from searx.network import get as http_get
from searx.enginelib import EngineCache
if typing.TYPE_CHECKING:
    # Seen by static type checkers only.  ``logger`` is used by
    # get_client_id() but never assigned in this module, so at runtime it has
    # to be supplied from outside (presumably by the engine loader -- TODO
    # confirm).
    import logging

    logger: logging.Logger
# Descriptive metadata about the engine and the service it queries.
about = {
    "website": "https://soundcloud.com",
    "wikidata_id": "Q568769",
    "official_api_documentation": "https://developers.soundcloud.com/docs/api/guide",
    # The engine talks to the web frontend's API (see ``search_url``), not to
    # the documented developer API, and needs no user supplied key: the
    # client id is scraped by get_client_id().
    "use_official_api": False,
    "require_api_key": False,
    # response() decodes the reply with ``resp.json()``.
    "results": 'JSON',
}
# Categories this engine is assigned to (presumably read by the engine loader
# -- TODO confirm).
categories = ["music"]
# request() derives the result offset from ``params['pageno']``.
paging = True
search_url = "https://api-v2.soundcloud.com/search"
"""This is not the official (developer) url, it is the API which is used by the
HTML frontend of the common WEB site.
"""
# Matches ``client_id:"<value>"`` (case-insensitive); get_client_id() runs it
# over the downloaded script files and takes the first group as the id.
cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U)
# Sent as ``limit`` and used as the page step when computing ``offset``.
results_per_page = 10
# Sent unchanged as the ``facet`` query argument by request().
soundcloud_facet = "model"
# Maps the primary language subtag of ``params["language"]`` (the part before
# the first "-") to the value sent as ``app_locale``.  request() falls back to
# "en" for any language that is not listed here.  Several keys share a target
# (e.g. "oc" -> "fr", "szl" -> "pl", "pap" -> "pt_BR"), presumably because
# SoundCloud offers no locale of its own for them -- TODO confirm.
app_locale_map = {
    "de": "de",
    "en": "en",
    "es": "es",
    "fr": "fr",
    "oc": "fr",
    "it": "it",
    "nl": "nl",
    "pl": "pl",
    "szl": "pl",
    "pt": "pt_BR",
    "pap": "pt_BR",
    "sv": "sv",
}
# Only declared here; the instance is created by init() and used by request()
# to store the scraped client id under the key "guest_client_id".
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
def request(query, params):
    """Assemble the SoundCloud search URL and store it in ``params['url']``.

    The client id needed by the API is looked up in ``CACHE`` under the key
    ``guest_client_id``.  On a cache miss it is fetched with
    :py:func:`get_client_id` and written back to the cache, but only if a
    non-empty id was obtained.

    :param query: search term, sent as the ``q`` argument.
    :param params: request parameters; ``pageno`` and ``language`` are read,
        ``url`` is written.
    :return: the same ``params`` dict.
    """
    # missing attributes: user_id, app_version
    # - user_id=451561-497874-703312-310156
    # - app_version=1740727428

    client_id = CACHE.get("guest_client_id")
    if client_id is None:
        client_id = get_client_id()
        if client_id:
            CACHE.set(key="guest_client_id", value=client_id)

    # pages are 1-based, the API expects a 0-based item offset
    first_item = (params['pageno'] - 1) * results_per_page
    # only the primary language subtag is looked up ("de-AT" -> "de")
    primary_lang = params["language"].partition("-")[0]

    # NOTE: when no client id could be obtained, the empty value is still
    # passed on to urlencode() and ends up in the query string.
    query_args = {
        "q": query,
        "offset": first_item,
        "limit": results_per_page,
        "facet": soundcloud_facet,
        "client_id": client_id,
        "app_locale": app_locale_map.get(primary_lang, "en"),
    }

    params['url'] = search_url + "?" + urlencode(query_args)
    return params
def response(resp):
    """Convert a SoundCloud search reply into a list of result dicts.

    Only items whose ``kind`` is ``track`` or ``playlist`` and which carry a
    non-empty ``permalink_url`` become results; all other items are skipped.

    Optional fields (``uri``, ``duration``, ``artwork_url``, ``user``, ...)
    that are missing or ``null`` in the JSON yield ``None`` values instead of
    raising, so a single incomplete item does not discard the whole page.
    ``title`` and ``last_modified`` are still required for every accepted
    item.

    :param resp: HTTP response; only its ``json()`` method is used.
    :return: list of dicts with the keys ``url``, ``title``, ``content``,
        ``publishedDate``, ``iframe_src``, ``views``, ``thumbnail``,
        ``author`` and -- when the duration is at least one second --
        ``length``.
    """
    results = []
    data = resp.json()

    # "collection" may be absent or null in the reply
    for item in data.get("collection") or []:
        if item.get("kind") not in ("track", "playlist"):
            continue

        url = item.get("permalink_url")
        if not url:
            continue

        # "user" may be absent or null; normalise once for both lookups below
        user = item.get("user") or {}

        # the embedded player is addressed by the (URL-quoted) API uri of the
        # item; without a uri there is nothing to embed
        uri = item.get("uri")
        iframe_src = None
        if uri:
            iframe_src = "https://w.soundcloud.com/player/?url=" + quote_plus(uri)

        content = (item.get("description"), item.get("label_name"))

        res = {
            "url": url,
            "title": item["title"],
            "content": " / ".join(part for part in content if part),
            "publishedDate": parser.parse(item["last_modified"]),
            "iframe_src": iframe_src,
            "views": item.get("playback_count", 0) or None,
        }

        # the artwork is preferred, the uploader's avatar is the fallback
        res["thumbnail"] = item.get("artwork_url") or user.get("avatar_url") or None

        # duration is given in milliseconds; "length" is only set when the
        # truncated value is at least one second
        seconds = int((item.get("duration") or 0) / 1000)
        if seconds:
            res["length"] = datetime.timedelta(seconds=seconds)

        res["author"] = user.get("full_name") or None
        results.append(res)

    return results
def init(engine_settings):
    """Create the engine's cache and bind it to the module-level ``CACHE``.

    :param engine_settings: engine configuration; its ``name`` entry is passed
        to :py:class:`EngineCache`.
    """
    global CACHE  # pylint: disable=global-statement

    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)  # type:ignore
def get_client_id() -> str | None:
    """Scrape a client id from the SoundCloud web frontend.

    Fetches ``https://soundcloud.com``, collects the ``src`` of every
    ``<script>`` tag whose ``src`` contains ``/assets/`` and downloads these
    scripts one by one, last one first, until ``cid_re`` matches.

    :return: the client id, or ``None`` if the landing page could not be
        fetched or none of the scripts contains a non-empty client id.
    """
    landing_url = "https://soundcloud.com"
    resp = http_get(landing_url, timeout=10)

    if not resp.ok:
        logger.error("init: GET %s failed", landing_url)
        # None rather than "": every failure path reports "no id" the same
        # way, as promised by the return annotation
        return None

    # extracts valid app_js urls from soundcloud.com content
    tree = html.fromstring(resp.content)
    script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
    app_js_urls = [tag.get("src") for tag in script_tags]

    client_id = None

    # the scripts are tried in reverse document order (presumably the script
    # holding the id is among the last ones -- TODO confirm)
    for app_js_url in reversed(app_js_urls):
        # gets app_js and search for the client_id
        resp = http_get(app_js_url)

        if not resp.ok:
            logger.error("init: app_js GET %s failed", app_js_url)
            continue

        match = cid_re.search(resp.content.decode())
        if match:
            client_id = match.group(1)
            break

    if client_id:
        logger.info("using client_id '%s' for soundclud queries", client_id)
    else:
        logger.warning("missing valid client_id for soundclud queries")

    # an empty match ("") is reported as None as well
    return client_id or None