[fix] Results.url: don't normalize www.example.com to example.com

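The "www." part of a hostname in result URLs can't be normalized away (i.e. replaced by an empty string); the following two URLs are distinct and need not be equivalent: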

- https://www.tu-darmstadt.de/
- https://tu-darmstadt.de/

Reported-By: @Bnyro <bnyro@tutanota.com>
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2025-03-21 08:04:10 +01:00 committed by Markus Heiser
parent ef99cc472d
commit 96a6e3dcb2

View File

@ -51,7 +51,7 @@ def _normalize_url_fields(result: Result | LegacyResult):
# if the result has no scheme, use http as default # if the result has no scheme, use http as default
scheme=result.parsed_url.scheme or "http", scheme=result.parsed_url.scheme or "http",
# normalize ``www.example.com`` to ``example.com`` # normalize ``www.example.com`` to ``example.com``
netloc=result.parsed_url.netloc.replace("www.", ""), # netloc=result.parsed_url.netloc.replace("www.", ""),
# normalize ``example.com/path/`` to ``example.com/path`` # normalize ``example.com/path/`` to ``example.com/path``
path=result.parsed_url.path.rstrip("/"), path=result.parsed_url.path.rstrip("/"),
) )
@ -69,7 +69,7 @@ def _normalize_url_fields(result: Result | LegacyResult):
_url = urllib.parse.urlparse(_url) _url = urllib.parse.urlparse(_url)
item["url"] = _url._replace( item["url"] = _url._replace(
scheme=_url.scheme or "http", scheme=_url.scheme or "http",
netloc=_url.netloc.replace("www.", ""), # netloc=_url.netloc.replace("www.", ""),
path=_url.path.rstrip("/"), path=_url.path.rstrip("/"),
).geturl() ).geturl()
@ -78,7 +78,7 @@ def _normalize_url_fields(result: Result | LegacyResult):
_url = urllib.parse.urlparse(infobox_id) _url = urllib.parse.urlparse(infobox_id)
result.id = _url._replace( result.id = _url._replace(
scheme=_url.scheme or "http", scheme=_url.scheme or "http",
netloc=_url.netloc.replace("www.", ""), # netloc=_url.netloc.replace("www.", ""),
path=_url.path.rstrip("/"), path=_url.path.rstrip("/"),
).geturl() ).geturl()