diff --git a/docs/dev/engines/offline_concept.rst b/docs/dev/engines/offline_concept.rst index ddb34fc60..b1a44f469 100644 --- a/docs/dev/engines/offline_concept.rst +++ b/docs/dev/engines/offline_concept.rst @@ -29,7 +29,7 @@ Programming Interface parameter. This function can be omitted, if there is no need to setup anything in advance. -:py:func:`search(query, params) ` +:py:func:`search(query, params) ` Each offline engine has a function named ``search``. This function is responsible to perform a search and return the results in a presentable format. (Where *presentable* means presentable by the selected result diff --git a/searx/favicons/__init__.py b/searx/favicons/__init__.py index 39c4ad716..4e0aaaf3e 100644 --- a/searx/favicons/__init__.py +++ b/searx/favicons/__init__.py @@ -1,5 +1,12 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""Implementations for providing the favicons in SearXNG""" +"""Implementations for providing the favicons in SearXNG. + +There is a command line for developer purposes and for deeper analysis. Here is +an example in which the command line is called in the development environment:: + + $ ./manage pyenv.cmd bash --norc --noprofile + (py3) python -m searx.favicons --help +""" from __future__ import annotations diff --git a/searx/favicons/cache.py b/searx/favicons/cache.py index 1aa722d89..cf71705aa 100644 --- a/searx/favicons/cache.py +++ b/searx/favicons/cache.py @@ -236,6 +236,12 @@ class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache): model in the SQLite DB is implemented using the abstract class :py:obj:`sqlitedb.SQLiteAppl`. 
+ For introspection of the DB, jump into developer environment and run command + to show cache state:: + + $ ./manage pyenv.cmd bash --norc --noprofile + (py3) python -m searx.favicons cache state + The following configurations are required / supported: - :py:obj:`FaviconCacheConfig.db_url` @@ -357,6 +363,10 @@ CREATE TABLE IF NOT EXISTS blob_map ( if sha256 != FALLBACK_ICON: conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data)) conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority)) + # hint: the with context of the connection object closes the transaction + # but not the DB connection. The connection has to be closed by the + # caller of self.connect()! + conn.close() return True @@ -376,7 +386,8 @@ CREATE TABLE IF NOT EXISTS blob_map ( return self.properties.set("LAST_MAINTENANCE", "") # hint: this (also) sets the m_time of the property! - # do maintenance tasks + # Do maintenance tasks. This can take a little more time, to avoid + # DB locks, establish a new DB connection. with self.connect() as conn: @@ -407,6 +418,12 @@ CREATE TABLE IF NOT EXISTS blob_map ( conn.execute("DELETE FROM blob_map WHERE sha256 IN ('%s')" % "','".join(sha_list)) logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c) + # Vacuuming the WALs + # https://www.theunterminatedstring.com/sqlite-vacuuming/ + + conn.execute("PRAGMA wal_checkpoint(TRUNCATE)") + conn.close() + def _query_val(self, sql, default=None): val = self.DB.execute(sql).fetchone() if val is not None: diff --git a/searx/sqlitedb.py b/searx/sqlitedb.py index 8c982c49a..364b43db3 100644 --- a/searx/sqlitedb.py +++ b/searx/sqlitedb.py @@ -7,20 +7,83 @@ :py:obj:`SQLiteProperties`: Class to manage properties stored in a database. ----- +Exemplary implementations based on :py:obj:`SQLiteAppl`: +:py:obj:`searx.cache.ExpireCacheSQLite` : + Cache that manages key/value pairs in a SQLite DB, in which the key/value + pairs are deleted after an "expire" time.
This type of cache is used, for + example, for the engines, see :py:obj:`searx.enginelib.EngineCache`. + +:py:obj:`searx.favicons.cache.FaviconCacheSQLite` : + Favicon cache that manages the favicon BLOBs in a SQLite DB. + +---- """ from __future__ import annotations -import sys +import abc +import datetime import re import sqlite3 +import sys import threading -import abc +import uuid from searx import logger -logger = logger.getChild('sqlitedb') +logger = logger.getChild("sqlitedb") + +THREAD_LOCAL = threading.local() + + +class DBSession: + """A *thread-local* DB session""" + + @classmethod + def get_connect(cls, app: SQLiteAppl) -> sqlite3.Connection: + """Returns a thread local DB connection. The connection is only + established once per thread. + """ + if getattr(THREAD_LOCAL, "DBSession_map", None) is None: + THREAD_LOCAL.DBSession_map = {} + + session = THREAD_LOCAL.DBSession_map.get(app.db_url) + if session is None: + session = cls(app) + return session.conn + + def __init__(self, app: SQLiteAppl): + self.uuid = uuid.uuid4() + self.app = app + self._conn = None + # self.__del__ will be called when the thread ends + if getattr(THREAD_LOCAL, "DBSession_map", None) is None: + THREAD_LOCAL.DBSession_map = {} + THREAD_LOCAL.DBSession_map[self.app.db_url] = self + + @property + def conn(self) -> sqlite3.Connection: + msg = f"[{threading.current_thread().ident}] DBSession: " f"{self.app.__class__.__name__}({self.app.db_url})" + if self._conn is None: + self._conn = self.app.connect() + logger.debug("%s --> created new connection", msg) + # else: + # logger.debug("%s --> already connected", msg) + + return self._conn + + def __del__(self): + try: + if self._conn is not None: + # HINT: Don't use Python's logging facility in a destructor, it + # will produce error reports when python aborts the process or + # thread, because at this point objects that the logging module + # needs, do not exist anymore.
+ # msg = f"DBSession: close [{self.uuid}] {self.app.__class__.__name__}({self.app.db_url})" + # logger.debug(msg) + self._conn.close() + except Exception: # pylint: disable=broad-exception-caught + pass class SQLiteAppl(abc.ABC): @@ -51,13 +114,18 @@ class SQLiteAppl(abc.ABC): """ SQLITE_JOURNAL_MODE = "WAL" + """``SQLiteAppl`` applications are optimized for WAL_ mode, it's not recommended + to change the journal mode (see :py:obj:`SQLiteAppl.tear_down`). + + .. _WAL: https://sqlite.org/wal.html + """ SQLITE_CONNECT_ARGS = { # "timeout": 5.0, # "detect_types": 0, "check_same_thread": bool(SQLITE_THREADING_MODE != "serialized"), "cached_statements": 0, # https://github.com/python/cpython/issues/118172 # "uri": False, - "autocommit": False, + "isolation_level": None, } # fmt:skip """Connection arguments (:py:obj:`sqlite3.connect`) @@ -66,10 +134,6 @@ class SQLiteAppl(abc.ABC): ``serialized``. The check is more of a hindrance in this case because it would prevent a DB connector from being used in multiple threads. - ``autocommit``: - Is disabled by default. Note: autocommit option has been added in Python - 3.12. - ``cached_statements``: Is set to ``0`` by default. Note: Python 3.12+ fetch result are not consistent in multi-threading application and causing an API misuse error. @@ -89,9 +153,17 @@ class SQLiteAppl(abc.ABC): self.db_url = db_url self.properties = SQLiteProperties(db_url) - self.thread_local = threading.local() self._init_done = False self._compatibility() + # atexit.register(self.tear_down) + + # def tear_down(self): + # """:ref:`Vacuuming the WALs` upon normal interpreter termination + # (:py:obj:`atexit.register`). + + # ..
_SQLite: Vacuuming the WALs: https://www.theunterminatedstring.com/sqlite-vacuuming/ + # """ + # self.DB.execute("PRAGMA wal_checkpoint(TRUNCATE)") def _compatibility(self): @@ -113,19 +185,31 @@ class SQLiteAppl(abc.ABC): "SQLite runtime library version %s is not supported (require >= 3.35)", sqlite3.sqlite_version ) + def _connect(self) -> sqlite3.Connection: + conn = sqlite3.Connection(self.db_url, **self.SQLITE_CONNECT_ARGS) # type: ignore + conn.execute(f"PRAGMA journal_mode={self.SQLITE_JOURNAL_MODE}") + self.register_functions(conn) + return conn + def connect(self) -> sqlite3.Connection: """Creates a new DB connection (:py:obj:`SQLITE_CONNECT_ARGS`). If not - already done, the DB schema is set up + already done, the DB schema is set up. The caller must take care of + closing the resource. Alternatively, :py:obj:`SQLiteAppl.DB` can also + be used (the resource behind `self.DB` is automatically closed when the + process or thread is terminated). """ if sys.version_info < (3, 12): # Prior Python 3.12 there is no "autocommit" option self.SQLITE_CONNECT_ARGS.pop("autocommit", None) - self.init() - logger.debug("%s: connect to DB: %s // %s", self.__class__.__name__, self.db_url, self.SQLITE_CONNECT_ARGS) - conn = sqlite3.Connection(self.db_url, **self.SQLITE_CONNECT_ARGS) # type: ignore - conn.execute(f"PRAGMA journal_mode={self.SQLITE_JOURNAL_MODE}") - self.register_functions(conn) + msg = ( + f"[{threading.current_thread().ident}] {self.__class__.__name__}({self.db_url})" + f" {self.SQLITE_CONNECT_ARGS} // {self.SQLITE_JOURNAL_MODE}" + ) + logger.debug(msg) + + with self._connect() as conn: + self.init(conn) return conn def register_functions(self, conn): @@ -150,7 +234,7 @@ class SQLiteAppl(abc.ABC): .. 
_re.search: https://docs.python.org/3/library/re.html#re.search """ - conn.create_function('regexp', 2, lambda x, y: 1 if re.search(x, y) else 0, deterministic=True) + conn.create_function("regexp", 2, lambda x, y: 1 if re.search(x, y) else 0, deterministic=True) @property def DB(self) -> sqlite3.Connection: @@ -168,57 +252,66 @@ https://docs.python.org/3/library/sqlite3.html#sqlite3-controlling-transactions """ - if getattr(self.thread_local, 'DB', None) is None: - self.thread_local.DB = self.connect() + conn = None - # Theoretically it is possible to reuse the DB cursor across threads as - # of Python 3.12, in practice the threading of the cursor seems to me to - # be so faulty that I prefer to establish one connection per thread + if self.SQLITE_THREADING_MODE == "serialized": + # Theoretically it is possible to reuse the DB cursor across threads + # as of Python 3.12, in practice the threading of the cursor seems + # to me a little faulty, so I prefer to establish one connection + # per thread. + # + # maybe we can activate this code one day .. + # if self._DB is None: + # self._DB = self.connect() + # conn = self._DB + conn = DBSession.get_connect(self) + else: + conn = DBSession.get_connect(self) - self.thread_local.DB.commit() - return self.thread_local.DB + # Since more than one instance of SQLiteAppl share the same DB + # connection, we need to make sure that each SQLiteAppl instance has run + # its init method at least once. + self.init(conn) - # In "serialized" mode, SQLite can be safely used by multiple threads - # with no restriction.
- # - # if self.SQLITE_THREADING_MODE != "serialized": - # if getattr(self.thread_local, 'DB', None) is None: - # self.thread_local.DB = self.connect() - # return self.thread_local.DB - # - # if self._DB is None: - # self._DB = self.connect() # pylint: disable=attribute-defined-outside-init - # return self._DB + return conn - def init(self): + def init(self, conn: sqlite3.Connection) -> bool: """Initializes the DB schema and properties, is only executed once even - if called several times.""" + if called several times. + + If the initialization has not yet taken place, it is carried out and a + `True` is returned to the caller at the end. If the initialization has + already been carried out in the past, `False` is returned. + """ if self._init_done: - return + return False self._init_done = True logger.debug("init DB: %s", self.db_url) - self.properties.init() + self.properties.init(conn) + ver = self.properties("DB_SCHEMA") if ver is None: - with self.properties.DB: - self.create_schema(self.properties.DB) + with conn: + self.create_schema(conn) else: ver = int(ver) if ver != self.DB_SCHEMA: raise sqlite3.DatabaseError("Expected DB schema v%s, DB schema is v%s" % (self.DB_SCHEMA, ver)) logger.debug("DB_SCHEMA = %s", ver) - def create_schema(self, conn): + return True + + def create_schema(self, conn: sqlite3.Connection): logger.debug("create schema ..") + self.properties.set("DB_SCHEMA", self.DB_SCHEMA) + self.properties.set("LAST_MAINTENANCE", "") with conn: for table_name, sql in self.DDL_CREATE_TABLES.items(): conn.execute(sql) self.properties.set(f"Table {table_name} created", table_name) - self.properties.set("DB_SCHEMA", self.DB_SCHEMA) - self.properties.set("LAST_MAINTENANCE", "") class SQLiteProperties(SQLiteAppl): @@ -253,33 +346,32 @@ CREATE TABLE IF NOT EXISTS properties ( " ON CONFLICT(name) DO UPDATE" " SET value=excluded.value, m_time=strftime('%s', 'now')" ) + SQL_DELETE = "DELETE FROM properties WHERE name = ?" 
SQL_TABLE_EXISTS = ( "SELECT name FROM sqlite_master" " WHERE type='table' AND name='properties'" ) # fmt:skip SQLITE_CONNECT_ARGS = dict(SQLiteAppl.SQLITE_CONNECT_ARGS) - SQLITE_CONNECT_ARGS["autocommit"] = True # This option has no effect before Python 3.12 def __init__(self, db_url: str): # pylint: disable=super-init-not-called self.db_url = db_url - self.thread_local = threading.local() self._init_done = False self._compatibility() - def init(self): + def init(self, conn: sqlite3.Connection) -> bool: """Initializes DB schema of the properties in the DB.""" if self._init_done: - return + return False self._init_done = True logger.debug("init properties of DB: %s", self.db_url) - with self.DB as conn: - res = conn.execute(self.SQL_TABLE_EXISTS) - if res.fetchone() is None: # DB schema needs to be be created - self.create_schema(conn) + res = conn.execute(self.SQL_TABLE_EXISTS) + if res.fetchone() is None: # DB schema needs to be created + self.create_schema(conn) + return True - def __call__(self, name, default=None): + def __call__(self, name: str, default=None): """Returns the value of the property ``name`` or ``default`` if property not exists in DB.""" @@ -288,36 +380,47 @@ CREATE TABLE IF NOT EXISTS properties ( return default return res[0] - def set(self, name, value): + def set(self, name: str, value: str | int): """Set ``value`` of property ``name`` in DB. If property already exists, update the ``m_time`` (and the value).""" - self.DB.execute(self.SQL_SET, (name, value)) + with self.DB: + self.DB.execute(self.SQL_SET, (name, value)) - if sys.version_info <= (3, 12): - # Prior Python 3.12 there is no "autocommit" option / lets commit - # explicitely.
- self.DB.commit() + def delete(self, name: str) -> int: + """Delete property ``name`` from DB.""" + with self.DB: + cur = self.DB.execute(self.SQL_DELETE, (name,)) + return cur.rowcount - def row(self, name, default=None): + def row(self, name: str, default=None): """Returns the DB row of property ``name`` or ``default`` if property not exists in DB.""" - cur = self.DB.cursor() - cur.execute("SELECT * FROM properties WHERE name = ?", (name,)) - res = cur.fetchone() - if res is None: + res = self.DB.execute("SELECT * FROM properties WHERE name = ?", (name,)) + row = res.fetchone() + if row is None: return default - col_names = [column[0] for column in cur.description] - return dict(zip(col_names, res)) - def m_time(self, name, default: int = 0) -> int: + col_names = [column[0] for column in res.description] + return dict(zip(col_names, row)) + + def m_time(self, name: str, default: int = 0) -> int: """Last modification time of this property.""" - res = self.DB.execute(self.SQL_M_TIME, (name,)).fetchone() - if res is None: + res = self.DB.execute(self.SQL_M_TIME, (name,)) + row = res.fetchone() + if row is None: return default - return int(res[0]) + return int(row[0]) def create_schema(self, conn): with conn: conn.execute(self.DDL_PROPERTIES) + + def __str__(self) -> str: + lines = [] + for row in self.DB.execute("SELECT name, value, m_time FROM properties"): + name, value, m_time = row + m_time = datetime.datetime.fromtimestamp(m_time).strftime("%Y-%m-%d %H:%M:%S") + lines.append(f"[last modified: {m_time}] {name:20s}: {value}") + return "\n".join(lines)