Merge cdb336969cc9fa22a2d549a5feb10c8d8c63c8bf into bc06b1aece43c50fa0dae7e6d90389814bebbc91
This commit is contained in:
commit
4cea5c1a96
1
.github/workflows/data-update.yml
vendored
1
.github/workflows/data-update.yml
vendored
@ -20,6 +20,7 @@ jobs:
|
||||
- update_engine_traits.py
|
||||
- update_wikidata_units.py
|
||||
- update_engine_descriptions.py
|
||||
- update_tracker_patterns.py
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
@ -15,6 +15,7 @@ __all__ = [
|
||||
'OSM_KEYS_TAGS',
|
||||
'ENGINE_DESCRIPTIONS',
|
||||
'LOCALES',
|
||||
'TRACKER_PATTERNS',
|
||||
'ahmia_blacklist_loader',
|
||||
]
|
||||
|
||||
@ -51,3 +52,4 @@ OSM_KEYS_TAGS = _load('osm_keys_tags.json')
|
||||
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
|
||||
ENGINE_TRAITS = _load('engine_traits.json')
|
||||
LOCALES = _load('locales.json')
|
||||
TRACKER_PATTERNS = _load('tracker_patterns.json')
|
||||
|
1
searx/data/tracker_patterns.json
Normal file
1
searx/data/tracker_patterns.json
Normal file
File diff suppressed because one or more lines are too long
@ -5,11 +5,12 @@ from __future__ import annotations
|
||||
import typing
|
||||
|
||||
import re
|
||||
from urllib.parse import urlunparse, parse_qsl, urlencode
|
||||
from urllib.parse import urlunparse, parse_qsl, urlencode, ParseResult
|
||||
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.plugins import Plugin, PluginInfo
|
||||
from searx.data import TRACKER_PATTERNS
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from searx.search import SearchWithPlugins
|
||||
@ -17,13 +18,6 @@ if typing.TYPE_CHECKING:
|
||||
from searx.result_types import Result
|
||||
from searx.plugins import PluginCfg
|
||||
|
||||
regexes = {
|
||||
re.compile(r'utm_[^&]+'),
|
||||
re.compile(r'(wkey|wemail)[^&]*'),
|
||||
re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'),
|
||||
re.compile(r'&$'),
|
||||
}
|
||||
|
||||
|
||||
class SXNGPlugin(Plugin):
|
||||
"""Remove trackers arguments from the returned URL"""
|
||||
@ -39,20 +33,31 @@ class SXNGPlugin(Plugin):
|
||||
preference_section="privacy",
|
||||
)
|
||||
|
||||
def _remove_queries(self, url: ParseResult, query_regexes: list[str]):
|
||||
parsed_query: list[tuple[str, str]] = list(parse_qsl(url.query))
|
||||
|
||||
for param_name, param_value in parsed_query.copy():
|
||||
for reg in query_regexes:
|
||||
if re.match(reg, param_name):
|
||||
parsed_query.remove((param_name, param_value))
|
||||
|
||||
return url._replace(query=urlencode(parsed_query))
|
||||
|
||||
def on_result(
    self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result
) -> bool:  # pylint: disable=unused-argument
    """Strip known tracker query arguments from ``result.url``.

    For every rule in :py:obj:`searx.data.TRACKER_PATTERNS` whose
    ``urlPattern`` matches the result URL (and which is not covered by one of
    the rule's ``exceptions``), the rule's ``trackerParams`` are removed from
    the URL's query string via :py:meth:`_remove_queries`.

    Always returns ``True`` so the result is kept in the result list.

    NOTE(review): the diff view interleaved the pre-merge inline regex loop
    with this implementation; only the post-merge (TRACKER_PATTERNS based)
    logic is kept here.
    """
    if not result.parsed_url:
        # Nothing to clean without a parsed URL.
        return True

    for rule in TRACKER_PATTERNS:
        # A rule only applies when the whole URL matches its provider pattern.
        if not re.match(rule["urlPattern"], result.url):
            continue

        # Skip URLs explicitly exempted by the rule's exception list.
        for exception in rule["exceptions"]:
            if re.match(exception, result.url):
                break
        else:
            # No exception matched: strip this rule's tracker parameters and
            # keep result.url in sync with result.parsed_url.
            result.parsed_url = self._remove_queries(result.parsed_url, rule["trackerParams"])
            result.url = urlunparse(result.parsed_url)

    return True
|
||||
|
36
searxng_extra/update/update_tracker_patterns.py
Normal file
36
searxng_extra/update/update_tracker_patterns.py
Normal file
@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Fetch trackers"""
|
||||
|
||||
import json
|
||||
import httpx
|
||||
|
||||
from searx.data import data_dir
|
||||
|
||||
DATA_FILE = data_dir / 'tracker_patterns.json'
|
||||
CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"
|
||||
|
||||
|
||||
def fetch_clear_url_filters():
    """Download the ClearURLs rule set and convert it into the list of
    tracker-pattern dicts stored in ``tracker_patterns.json``.

    :return: list of dicts with keys ``urlPattern``, ``exceptions`` and
        ``trackerParams`` (regex strings converted from the JavaScript
        double-escaped form to Python syntax).
    :raises Exception: when the HTTP response status is not 200.
    """
    resp = httpx.get(CLEAR_LIST_URL)
    if resp.status_code != 200:
        # pylint: disable=broad-exception-raised
        # f-string instead of str + int concatenation: the original
        # ``"... code " + resp.status_code`` raised TypeError, masking the
        # intended error message.
        raise Exception(f"Error fetching ClearURL filter lists, HTTP code {resp.status_code}")

    providers = resp.json()["providers"]
    rules = []
    for rule in providers.values():
        rules.append(
            {
                # ClearURLs ships JavaScript-style double-escaped regexes;
                # collapse ``\\\\`` to ``\\`` for Python's ``re`` module.
                "urlPattern": rule["urlPattern"].replace("\\\\", "\\"),
                "exceptions": [exc.replace("\\\\", "\\") for exc in rule["exceptions"]],
                "trackerParams": rule["rules"],
            }
        )

    return rules
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Fetch the converted ClearURLs rules and persist them as the JSON data
    # file consumed by searx.data.TRACKER_PATTERNS.
    rules = fetch_clear_url_filters()
    with DATA_FILE.open("w", encoding='utf-8') as output:
        json.dump(rules, output)
|
Loading…
x
Reference in New Issue
Block a user