Don't remove a dash at the end of a line when guessing urls (bug 1974112)

This commit is contained in:
Calixte Denizet 2025-06-26 18:53:56 +02:00
parent 85b67f19bc
commit bb6b42177c
3 changed files with 18 additions and 2 deletions

View File

@ -195,4 +195,13 @@ describe("autolinker", function () {
["httptest@email.com", "mailto:httptest@email.com"], ["httptest@email.com", "mailto:httptest@email.com"],
]); ]);
}); });
// Regression test for bug 1974112: the input URL is split by a line break
// right after a dash, and the expected link keeps that dash while dropping
// only the line break.
it("shouldn't remove the dash when it's at the end of a line (bug 1974112)", function () {
  testLinks([
    [
      "https://github.com/pypi/linehaul-cloud-\nfunction",
      "https://github.com/pypi/linehaul-cloud-function",
    ],
  ]);
});
}); });

View File

@ -138,7 +138,7 @@ class Autolinker {
this.#regex ??= this.#regex ??=
/\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv; /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
const [normalizedText, diffs] = normalize(text); const [normalizedText, diffs] = normalize(text, { ignoreDashEOL: true });
const matches = normalizedText.matchAll(this.#regex); const matches = normalizedText.matchAll(this.#regex);
const links = []; const links = [];
for (const match of matches) { for (const match of matches) {

View File

@ -97,7 +97,7 @@ const NFKC_CHARS_TO_NORMALIZE = new Map();
let noSyllablesRegExp = null; let noSyllablesRegExp = null;
let withSyllablesRegExp = null; let withSyllablesRegExp = null;
function normalize(text) { function normalize(text, options = {}) {
// The diacritics in the text or in the query can be composed or not. // The diacritics in the text or in the query can be composed or not.
// So we use a decomposed text using NFD (and the same for the query) // So we use a decomposed text using NFD (and the same for the query)
// in order to be sure that diacritics are in the same order. // in order to be sure that diacritics are in the same order.
@ -118,6 +118,7 @@ function normalize(text) {
} }
const hasSyllables = syllablePositions.length > 0; const hasSyllables = syllablePositions.length > 0;
const ignoreDashEOL = options.ignoreDashEOL ?? false;
let normalizationRegex; let normalizationRegex;
if (!hasSyllables && noSyllablesRegExp) { if (!hasSyllables && noSyllablesRegExp) {
@ -294,6 +295,12 @@ function normalize(text) {
} }
if (p5) { if (p5) {
if (ignoreDashEOL) {
// Keep the - but remove the EOL.
shiftOrigin += 1;
eol += 1;
return p5.slice(0, -1);
}
// In "X-\ny", "-\n" is removed because a hyphen at the end of a line // In "X-\ny", "-\n" is removed because a hyphen at the end of a line
// between two letters is likely here to mark a break in a word. // between two letters is likely here to mark a break in a word.
// If X is encoded with UTF-32 then it can have a length greater than 1. // If X is encoded with UTF-32 then it can have a length greater than 1.