Fix autolinking errors due to regex and email validation

Fix some edge cases in the autolinking regex, validate the domain of
matched email addresses, and add unit tests for both.

Fixes: https://github.com/mozilla/pdf.js/issues/19462
This commit is contained in:
Ujjwal Sharma 2025-02-10 16:29:43 +01:00
parent a857ca3261
commit 70e2873430
2 changed files with 23 additions and 13 deletions

View File

@ -87,6 +87,9 @@ describe("autolinker", function () {
"CAP.cap@Gmail.Com", // Keep the original case. "CAP.cap@Gmail.Com", // Keep the original case.
"mailto:CAP.cap@Gmail.Com", "mailto:CAP.cap@Gmail.Com",
], ],
["partl@mail.boku.ac.at", "mailto:partl@mail.boku.ac.at"],
["Irene.Hyna@bmwf.ac.at", "mailto:Irene.Hyna@bmwf.ac.at"],
["<hi@foo.bar.baz>", "mailto:hi@foo.bar.baz"],
]); ]);
}); });
@ -140,6 +143,7 @@ describe("autolinker", function () {
"http//[00:00:00:00:00:00", // Invalid IPv6 address. "http//[00:00:00:00:00:00", // Invalid IPv6 address.
"http//[]", // Empty IPv6 address. "http//[]", // Empty IPv6 address.
"abc.example.com", // URL without scheme. "abc.example.com", // URL without scheme.
"JD?M$0QP)lKn06l1apKDC@\\qJ4B!!(5m+j.7F790m", // Not a valid email.
].join("\n") ].join("\n")
); );
expect(matches.length).toEqual(0); expect(matches.length).toEqual(0);

View File

@ -96,31 +96,37 @@ class Autolinker {
static #regex; static #regex;
static findLinks(text) { static findLinks(text) {
// Regex can be tested and verified at https://regex101.com/r/zgDwPE/1. // Regex can be tested and verified at https://regex101.com/r/rXoLiT/2.
this.#regex ??= this.#regex ??=
/\b(?:https?:\/\/|mailto:|www\.)(?:[[\S--\[]--\p{P}]|\/|[\p{P}--\[]+[[\S--\[]--\p{P}])+|\b[[\S--@]--\{]+@[\S--.]+\.[[\S--\[]--\p{P}]{2,}/gmv; /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
const [normalizedText, diffs] = normalize(text); const [normalizedText, diffs] = normalize(text);
const matches = normalizedText.matchAll(this.#regex); const matches = normalizedText.matchAll(this.#regex);
const links = []; const links = [];
for (const match of matches) { for (const match of matches) {
const raw = const [url, emailDomain] = match;
match[0].startsWith("www.") || let raw;
match[0].startsWith("mailto:") || if (
match[0].startsWith("http://") || url.startsWith("www.") ||
match[0].startsWith("https://") url.startsWith("http://") ||
? match[0] url.startsWith("https://")
: `mailto:${match[0]}`; ) {
const url = createValidAbsoluteUrl(raw, null, { raw = url;
} else if (URL.canParse(`http://${emailDomain}`)) {
raw = url.startsWith("mailto:") ? url : `mailto:${url}`;
} else {
continue;
}
const absoluteURL = createValidAbsoluteUrl(raw, null, {
addDefaultProtocol: true, addDefaultProtocol: true,
}); });
if (url) { if (absoluteURL) {
const [index, length] = getOriginalIndex( const [index, length] = getOriginalIndex(
diffs, diffs,
match.index, match.index,
match[0].length url.length
); );
links.push({ url: url.href, index, length }); links.push({ url: absoluteURL.href, index, length });
} }
} }
return links; return links;