[feat] tracker url plugin: use ClearURL tracking param list

This commit is contained in:
Bnyro 2025-04-28 18:06:59 +02:00
parent c733aa83e8
commit 1b62571df9
No known key found for this signature in database
4 changed files with 60 additions and 16 deletions

View File

@ -20,6 +20,7 @@ jobs:
- update_engine_traits.py
- update_wikidata_units.py
- update_engine_descriptions.py
- update_tracker_patterns.py
steps:
- name: Checkout
uses: actions/checkout@v4

View File

@ -15,6 +15,7 @@ __all__ = [
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'LOCALES',
'TRACKER_PATTERNS',
'ahmia_blacklist_loader',
]
@ -51,3 +52,4 @@ OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')
TRACKER_PATTERNS = _load('tracker_patterns.json')

View File

@ -5,11 +5,12 @@ from __future__ import annotations
import typing
import re
from urllib.parse import urlunparse, parse_qsl, urlencode
from urllib.parse import urlunparse, parse_qsl, urlencode, ParseResult
from flask_babel import gettext
from searx.plugins import Plugin, PluginInfo
from searx.data import TRACKER_PATTERNS
if typing.TYPE_CHECKING:
from searx.search import SearchWithPlugins
@ -17,13 +18,6 @@ if typing.TYPE_CHECKING:
from searx.result_types import Result
from searx.plugins import PluginCfg
# Compiled patterns for the query arguments that are considered trackers
# (utm_*, wkey/wemail and the HubSpot "_hs*" family, plus a trailing "&").
regexes = set(
    map(
        re.compile,
        (
            r'utm_[^&]+',
            r'(wkey|wemail)[^&]*',
            r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*',
            r'&$',
        ),
    )
)
class SXNGPlugin(Plugin):
"""Remove trackers arguments from the returned URL"""
@ -39,20 +33,31 @@ class SXNGPlugin(Plugin):
preference_section="privacy",
)
def _remove_queries(self, url: ParseResult, query_regexes: list[str]):
parsed_query: list[tuple[str, str]] = list(parse_qsl(url.query))
for param_name, param_value in parsed_query.copy():
for reg in query_regexes:
if re.match(reg, param_name):
parsed_query.remove((param_name, param_value))
return url._replace(query=urlencode(parsed_query))
# Strip tracking query arguments from a result's URL: works on
# ``result.parsed_url`` and rebuilds ``result.url`` from it.  Returns True on
# every path shown here.  ``request`` and ``search`` are not used by the body
# (hence the pylint "unused-argument" suppression).
def on_result(
self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result
) -> bool: # pylint: disable=unused-argument
# Results without a parsed URL are passed through untouched.
if not result.parsed_url:
return True
# NOTE(review): the statements below that iterate the module-level ``regexes``
# set appear to be the previous implementation, which this commit view shows
# next to its replacement (diff markers and indentation are not visible here)
# -- confirm against the real file before relying on them.
parsed_query: list[tuple[str, str]] = parse_qsl(result.parsed_url.query)
# Iterate over a copy so that pairs can be removed from ``parsed_query``.
for name_value in list(parsed_query):
param_name = name_value[0]
for reg in regexes:
if reg.match(param_name):
parsed_query.remove(name_value)
result.parsed_url = result.parsed_url._replace(query=urlencode(parsed_query))
result.url = urlunparse(result.parsed_url)
# Rule-based variant: each entry of TRACKER_PATTERNS is read through its keys
# "urlPattern", "exceptions" and "trackerParams".
for rule in TRACKER_PATTERNS:
# A rule only applies to URLs matched by its "urlPattern".
if not re.match(rule["urlPattern"], result.url):
continue
# Presumably the ``else`` below pairs with this ``for`` (for/else), so the
# arguments are only removed when no exception pattern matched the URL --
# TODO confirm nesting, the indentation is not visible in this view.
for exception in rule["exceptions"]:
if re.match(exception, result.url):
break
else:
result.parsed_url = self._remove_queries(result.parsed_url, rule["trackerParams"])
result.url = urlunparse(result.parsed_url)
return True

View File

@ -0,0 +1,36 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch trackers"""
import json
import httpx
from searx.data import data_dir
DATA_FILE = data_dir / 'tracker_patterns.json'
CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"
def fetch_clear_url_filters():
    """Download the ClearURLs rule list and convert it to tracker patterns.

    The JSON document at ``CLEAR_LIST_URL`` is fetched and every entry of its
    ``providers`` object is converted into one dict.

    :return: list of dicts with the keys ``urlPattern`` (str), ``exceptions``
        (list of str) and ``trackerParams`` (list taken from the provider's
        ``rules``).
    :raises Exception: if the HTTP status code of the response is not 200.
    """
    resp = httpx.get(CLEAR_LIST_URL)
    if resp.status_code != 200:
        # ``status_code`` is an int: format it into the message instead of
        # concatenating it to a str (which raised TypeError and lost the text).
        # pylint: disable=broad-exception-raised
        raise Exception(f"Error fetching ClearURL filter lists, HTTP code {resp.status_code}")

    providers = resp.json()["providers"]

    # NOTE(review): ``.get(..., [])`` keeps providers that carry no
    # "exceptions" / "rules" key instead of aborting the whole update with a
    # KeyError -- confirm whether the minified list really omits empty keys.
    return [
        {
            "urlPattern": rule["urlPattern"].replace("\\\\", "\\"),  # fix javascript regex syntax
            "exceptions": [exc.replace("\\\\", "\\") for exc in rule.get("exceptions", [])],
            "trackerParams": rule.get("rules", []),
        }
        for rule in providers.values()
    ]
if __name__ == '__main__':
    # Fetch and serialise first, then write the JSON text to DATA_FILE.
    serialized_patterns = json.dumps(fetch_clear_url_filters())
    with DATA_FILE.open("w", encoding='utf-8') as data_file:
        data_file.write(serialized_patterns)