Merge cdb336969cc9fa22a2d549a5feb10c8d8c63c8bf into bc06b1aece43c50fa0dae7e6d90389814bebbc91
This commit is contained in: 4cea5c1a96
.github/workflows/data-update.yml (vendored, 1 line changed)
@@ -20,6 +20,7 @@ jobs:
           - update_engine_traits.py
           - update_wikidata_units.py
           - update_engine_descriptions.py
+          - update_tracker_patterns.py
     steps:
       - name: Checkout
         uses: actions/checkout@v4
searx/data/__init__.py
@@ -15,6 +15,7 @@ __all__ = [
     'OSM_KEYS_TAGS',
     'ENGINE_DESCRIPTIONS',
     'LOCALES',
+    'TRACKER_PATTERNS',
     'ahmia_blacklist_loader',
 ]
 
@@ -51,3 +52,4 @@ OSM_KEYS_TAGS = _load('osm_keys_tags.json')
 ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
 ENGINE_TRAITS = _load('engine_traits.json')
 LOCALES = _load('locales.json')
+TRACKER_PATTERNS = _load('tracker_patterns.json')
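Note: _load itself is not part of this diff. Assuming it is the usual small helper in searx/data/__init__.py that reads a bundled JSON file, the new constant is loaded roughly as in this sketch (data_dir and the function body are assumptions, not lines from this commit):

import json
from pathlib import Path

data_dir = Path(__file__).parent  # assumption: the searx/data package directory

def _load(filename):
    # Assumed helper: read one bundled JSON data file and return the parsed object.
    with open(data_dir / filename, encoding='utf-8') as f:
        return json.load(f)

TRACKER_PATTERNS = _load('tracker_patterns.json')  # list of ClearURLs-derived rules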
searx/data/tracker_patterns.json (new file, 1 line)
File diff suppressed because one or more lines are too long
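The suppressed diff is the data file written by searxng_extra/update/update_tracker_patterns.py (added below). Going by the keys that script emits, each entry presumably has the following shape; the pattern and parameter values here are invented for illustration and are not taken from the real ClearURLs rules:

# Hypothetical example of one TRACKER_PATTERNS entry.
sample_rule = {
    # regex matched against the full result URL
    "urlPattern": r"^https?://(?:[a-z0-9-]+\.)*example\.com",
    # URLs matching any of these regexes are left untouched
    "exceptions": [r"^https?://example\.com/signup"],
    # regexes matched against individual query parameter names
    "trackerParams": ["utm_source", "utm_medium", "ref"],
}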
searx/plugins/tracker_url_remover.py
@@ -5,11 +5,12 @@ from __future__ import annotations
 import typing
 
 import re
-from urllib.parse import urlunparse, parse_qsl, urlencode
+from urllib.parse import urlunparse, parse_qsl, urlencode, ParseResult
 
 from flask_babel import gettext
 
 from searx.plugins import Plugin, PluginInfo
+from searx.data import TRACKER_PATTERNS
 
 if typing.TYPE_CHECKING:
     from searx.search import SearchWithPlugins
@@ -17,13 +18,6 @@ if typing.TYPE_CHECKING:
     from searx.result_types import Result
     from searx.plugins import PluginCfg
 
-regexes = {
-    re.compile(r'utm_[^&]+'),
-    re.compile(r'(wkey|wemail)[^&]*'),
-    re.compile(r'(_hsenc|_hsmi|hsCtaTracking|__hssc|__hstc|__hsfp)[^&]*'),
-    re.compile(r'&$'),
-}
-
 
 class SXNGPlugin(Plugin):
     """Remove trackers arguments from the returned URL"""
@@ -39,20 +33,31 @@
             preference_section="privacy",
         )
 
+    def _remove_queries(self, url: ParseResult, query_regexes: list[str]):
+        parsed_query: list[tuple[str, str]] = list(parse_qsl(url.query))
+
+        for param_name, param_value in parsed_query.copy():
+            for reg in query_regexes:
+                if re.match(reg, param_name):
+                    parsed_query.remove((param_name, param_value))
+
+        return url._replace(query=urlencode(parsed_query))
+
     def on_result(
         self, request: "SXNG_Request", search: "SearchWithPlugins", result: Result
     ) -> bool:  # pylint: disable=unused-argument
         if not result.parsed_url:
             return True
 
-        parsed_query: list[tuple[str, str]] = parse_qsl(result.parsed_url.query)
-        for name_value in list(parsed_query):
-            param_name = name_value[0]
-            for reg in regexes:
-                if reg.match(param_name):
-                    parsed_query.remove(name_value)
-                    result.parsed_url = result.parsed_url._replace(query=urlencode(parsed_query))
-                    result.url = urlunparse(result.parsed_url)
+        for rule in TRACKER_PATTERNS:
+            if not re.match(rule["urlPattern"], result.url):
+                continue
+
+            for exception in rule["exceptions"]:
+                if re.match(exception, result.url):
                     break
+            else:
+                result.parsed_url = self._remove_queries(result.parsed_url, rule["trackerParams"])
+                result.url = urlunparse(result.parsed_url)
 
         return True
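For clarity: the rewritten on_result works per rule. A rule applies only if its urlPattern matches the result URL, and the for/else means the trackerParams cleanup runs only when no exception pattern matched (Python's loop-else executes when the loop finishes without break). A self-contained sketch of that flow, with a made-up one-rule table standing in for TRACKER_PATTERNS:

import re
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode

# Hypothetical stand-in for one TRACKER_PATTERNS entry.
rule = {
    "urlPattern": r"^https?://(?:[a-z0-9-]+\.)*example\.com",
    "exceptions": [],
    "trackerParams": ["utm_[a-z]+", "ref"],
}

parsed = urlparse("https://example.com/page?id=42&utm_source=nl&ref=abc")
url = urlunparse(parsed)

if re.match(rule["urlPattern"], url) and not any(re.match(exc, url) for exc in rule["exceptions"]):
    # Keep only parameters whose name matches no trackerParams regex
    # (re.match anchors at the start of the name, as in _remove_queries above).
    query = [(name, value) for name, value in parse_qsl(parsed.query)
             if not any(re.match(reg, name) for reg in rule["trackerParams"])]
    parsed = parsed._replace(query=urlencode(query))

print(urlunparse(parsed))  # https://example.com/page?id=42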
searxng_extra/update/update_tracker_patterns.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Fetch trackers"""
+
+import json
+import httpx
+
+from searx.data import data_dir
+
+DATA_FILE = data_dir / 'tracker_patterns.json'
+CLEAR_LIST_URL = "https://raw.githubusercontent.com/ClearURLs/Rules/refs/heads/master/data.min.json"
+
+
+def fetch_clear_url_filters():
+    resp = httpx.get(CLEAR_LIST_URL)
+    if resp.status_code != 200:
+        # pylint: disable=broad-exception-raised
+        raise Exception("Error fetching ClearURL filter lists, HTTP code " + resp.status_code)  # type: ignore
+
+    providers = resp.json()["providers"]
+    rules = []
+    for rule in providers.values():
+        rules.append(
+            {
+                "urlPattern": rule["urlPattern"].replace("\\\\", "\\"),  # fix javascript regex syntax
+                "exceptions": [exc.replace("\\\\", "\\") for exc in rule["exceptions"]],
+                "trackerParams": rule["rules"],
+            }
+        )
+
+    return rules
+
+
+if __name__ == '__main__':
+    filter_list = fetch_clear_url_filters()
+    with DATA_FILE.open("w", encoding='utf-8') as f:
+        json.dump(filter_list, f)
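One caveat worth flagging in the error branch of fetch_clear_url_filters: "..." + resp.status_code concatenates a str with an int, so a failed fetch would raise TypeError rather than the intended Exception (the # type: ignore only silences the type checker, not the runtime error). A fixed variant of the same check:

if resp.status_code != 200:
    # pylint: disable=broad-exception-raised
    raise Exception(f"Error fetching ClearURL filter lists, HTTP code {resp.status_code}")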
|
Loading…
x
Reference in New Issue
Block a user