Fix autolinking errors due to regex and email validation
Fix some edge cases in the autolinking regex, validate email domains before linking them, and add unit tests covering both changes. Fixes: https://github.com/mozilla/pdf.js/issues/19462
This commit is contained in:
parent
a857ca3261
commit
70e2873430
@ -87,6 +87,9 @@ describe("autolinker", function () {
|
|||||||
"CAP.cap@Gmail.Com", // Keep the original case.
|
"CAP.cap@Gmail.Com", // Keep the original case.
|
||||||
"mailto:CAP.cap@Gmail.Com",
|
"mailto:CAP.cap@Gmail.Com",
|
||||||
],
|
],
|
||||||
|
["partl@mail.boku.ac.at", "mailto:partl@mail.boku.ac.at"],
|
||||||
|
["Irene.Hyna@bmwf.ac.at", "mailto:Irene.Hyna@bmwf.ac.at"],
|
||||||
|
["<hi@foo.bar.baz>", "mailto:hi@foo.bar.baz"],
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -140,6 +143,7 @@ describe("autolinker", function () {
|
|||||||
"http//[00:00:00:00:00:00", // Invalid IPv6 address.
|
"http//[00:00:00:00:00:00", // Invalid IPv6 address.
|
||||||
"http//[]", // Empty IPv6 address.
|
"http//[]", // Empty IPv6 address.
|
||||||
"abc.example.com", // URL without scheme.
|
"abc.example.com", // URL without scheme.
|
||||||
|
"JD?M$0QP)lKn06l1apKDC@\\qJ4B!!(5m+j.7F790m", // Not a valid email.
|
||||||
].join("\n")
|
].join("\n")
|
||||||
);
|
);
|
||||||
expect(matches.length).toEqual(0);
|
expect(matches.length).toEqual(0);
|
||||||
|
|||||||
@ -96,31 +96,37 @@ class Autolinker {
|
|||||||
static #regex;
|
static #regex;
|
||||||
|
|
||||||
static findLinks(text) {
|
static findLinks(text) {
|
||||||
// Regex can be tested and verified at https://regex101.com/r/zgDwPE/1.
|
// Regex can be tested and verified at https://regex101.com/r/rXoLiT/2.
|
||||||
this.#regex ??=
|
this.#regex ??=
|
||||||
/\b(?:https?:\/\/|mailto:|www\.)(?:[[\S--\[]--\p{P}]|\/|[\p{P}--\[]+[[\S--\[]--\p{P}])+|\b[[\S--@]--\{]+@[\S--.]+\.[[\S--\[]--\p{P}]{2,}/gmv;
|
/\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
|
||||||
|
|
||||||
const [normalizedText, diffs] = normalize(text);
|
const [normalizedText, diffs] = normalize(text);
|
||||||
const matches = normalizedText.matchAll(this.#regex);
|
const matches = normalizedText.matchAll(this.#regex);
|
||||||
const links = [];
|
const links = [];
|
||||||
for (const match of matches) {
|
for (const match of matches) {
|
||||||
const raw =
|
const [url, emailDomain] = match;
|
||||||
match[0].startsWith("www.") ||
|
let raw;
|
||||||
match[0].startsWith("mailto:") ||
|
if (
|
||||||
match[0].startsWith("http://") ||
|
url.startsWith("www.") ||
|
||||||
match[0].startsWith("https://")
|
url.startsWith("http://") ||
|
||||||
? match[0]
|
url.startsWith("https://")
|
||||||
: `mailto:${match[0]}`;
|
) {
|
||||||
const url = createValidAbsoluteUrl(raw, null, {
|
raw = url;
|
||||||
|
} else if (URL.canParse(`http://${emailDomain}`)) {
|
||||||
|
raw = url.startsWith("mailto:") ? url : `mailto:${url}`;
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const absoluteURL = createValidAbsoluteUrl(raw, null, {
|
||||||
addDefaultProtocol: true,
|
addDefaultProtocol: true,
|
||||||
});
|
});
|
||||||
if (url) {
|
if (absoluteURL) {
|
||||||
const [index, length] = getOriginalIndex(
|
const [index, length] = getOriginalIndex(
|
||||||
diffs,
|
diffs,
|
||||||
match.index,
|
match.index,
|
||||||
match[0].length
|
url.length
|
||||||
);
|
);
|
||||||
links.push({ url: url.href, index, length });
|
links.push({ url: absoluteURL.href, index, length });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return links;
|
return links;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user