diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml
index 39893127d..ac826d196 100644
--- a/.github/workflows/data-update.yml
+++ b/.github/workflows/data-update.yml
@@ -20,6 +20,7 @@ jobs:
           - update_engine_traits.py
           - update_wikidata_units.py
           - update_engine_descriptions.py
+          - update_tracker_patterns.py
     steps:
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/searx/data/__init__.py b/searx/data/__init__.py
index 28a3974fa..2bb3a8ca0 100644
--- a/searx/data/__init__.py
+++ b/searx/data/__init__.py
@@ -15,6 +15,7 @@ __all__ = [
     'OSM_KEYS_TAGS',
     'ENGINE_DESCRIPTIONS',
     'LOCALES',
+    'TRACKER_PATTERNS',
     'ahmia_blacklist_loader',
 ]
 
@@ -51,3 +52,4 @@ OSM_KEYS_TAGS = _load('osm_keys_tags.json')
 ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
 ENGINE_TRAITS = _load('engine_traits.json')
 LOCALES = _load('locales.json')
+TRACKER_PATTERNS = _load('tracker_patterns.json')
diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py
index d9c767a36..f337aba0f 100644
--- a/searx/plugins/tracker_url_remover.py
+++ b/searx/plugins/tracker_url_remover.py
@@ -5,11 +5,12 @@ from __future__ import annotations
 import typing
 import re
 
-from urllib.parse import urlunparse, parse_qsl, urlencode
+from urllib.parse import urlunparse, parse_qsl, urlencode, ParseResult
 
 from flask_babel import gettext
 
 from searx.plugins import Plugin, PluginInfo
+from searx.data import TRACKER_PATTERNS
 
 if typing.TYPE_CHECKING:
     from searx.search import SearchWithPlugins
@@ -17,13 +18,6 @@ if typing.TYPE_CHECKING:
     from searx.result_types import Result
     from searx.plugins import PluginCfg
 
-regexes = {
-    re.compile(r'utm_[^&]+'),
-    re.compile(r'(wkey|wemail)[^&]*'),
-    re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'),
-    re.compile(r'&$'),
-}
-
 
 class SXNGPlugin(Plugin):
     """Remove trackers arguments from the returned URL"""
@@ -39,20 +33,31 @@ class SXNGPlugin(Plugin):
             preference_section="privacy",
         )
 
+    def _remove_queries(self, url: ParseResult, query_regexes: list[str]):
+        parsed_query: list[tuple[str, str]] = list(parse_qsl(url.query))
+
+        for param_name, param_value in parsed_query.copy():
+            for reg in query_regexes:
+                if re.match(reg, param_name):
+                    parsed_query.remove((param_name, param_value))
+
+        return url._replace(query=urlencode(parsed_query))
+
     def on_result(
         self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result
     ) -> bool:  # pylint: disable=unused-argument
         if not result.parsed_url:
             return True
 
-        parsed_query: list[tuple[str, str]] = parse_qsl(result.parsed_url.query)
-        for name_value in list(parsed_query):
-            param_name = name_value[0]
-            for reg in regexes:
-                if reg.match(param_name):
-                    parsed_query.remove(name_value)
-
-        result.parsed_url = result.parsed_url._replace(query=urlencode(parsed_query))
-        result.url = urlunparse(result.parsed_url)
+        for rule in TRACKER_PATTERNS:
+            if not re.match(rule["urlPattern"], result.url):
+                continue
+
+            for exception in rule["exceptions"]:
+                if re.match(exception, result.url):
+                    break
+            else:
+                result.parsed_url = self._remove_queries(result.parsed_url, rule["trackerParams"])
+                result.url = urlunparse(result.parsed_url)
 
         return True
diff --git a/searxng_extra/update/update_tracker_patterns.py b/searxng_extra/update/update_tracker_patterns.py
new file mode 100644
index 000000000..e5db2ce40
--- /dev/null
+++ b/searxng_extra/update/update_tracker_patterns.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Fetch tracker patterns from the ClearURLs rules list."""
+
+import json
+import httpx
+
+from searx.data import data_dir
+
+DATA_FILE = data_dir / 'tracker_patterns.json'
+CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"
+
+
+def fetch_clear_url_filters():
+    resp = httpx.get(CLEAR_LIST_URL)
+    if resp.status_code != 200:
+        # pylint: disable=broad-exception-raised
+        raise Exception(f"Error fetching ClearURL filter lists, HTTP code {resp.status_code}")
+
+    providers = resp.json()["providers"]
+    rules = []
+    for rule in providers.values():
+        rules.append(
+            {
+                "urlPattern": rule["urlPattern"].replace("\\\\", "\\"),  # fix JavaScript regex syntax
+                "exceptions": [exc.replace("\\\\", "\\") for exc in rule["exceptions"]],
+                "trackerParams": rule["rules"],
+            }
+        )
+
+    return rules
+
+
+if __name__ == '__main__':
+    filter_list = fetch_clear_url_filters()
+    with DATA_FILE.open("w", encoding='utf-8') as f:
+        json.dump(filter_list, f)
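
For orientation, here is a minimal sketch of the record shape that update_tracker_patterns.py writes to tracker_patterns.json and of how the plugin's matching logic consumes it. The sample rule and URL are made up for illustration; real entries come from the providers in the ClearURLs data.min.json.

import re
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

# Hypothetical rule in the shape produced by fetch_clear_url_filters():
# "urlPattern" and "exceptions" are regexes tested against the full URL,
# "trackerParams" are regexes tested against individual query parameter names.
rule = {
    "urlPattern": r".*",
    "exceptions": [r".*example\.org/keep-tracking.*"],
    "trackerParams": [r"utm_.*", r"fbclid"],
}

url = urlparse("https://example.org/article?id=42&utm_source=newsletter&fbclid=abc123")

matches = re.match(rule["urlPattern"], url.geturl())
excepted = any(re.match(exc, url.geturl()) for exc in rule["exceptions"])

if matches and not excepted:
    # Keep only parameters whose name matches none of the tracker regexes,
    # mirroring SXNGPlugin._remove_queries() above.
    cleaned = [
        (name, value)
        for name, value in parse_qsl(url.query)
        if not any(re.match(reg, name) for reg in rule["trackerParams"])
    ]
    url = url._replace(query=urlencode(cleaned))

print(urlunparse(url))  # -> https://example.org/article?id=42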