From bb6b42177cae20d8a064c60a3b3290499d37a7de Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Thu, 26 Jun 2025 18:53:56 +0200 Subject: [PATCH] Don't remove a dash at the end of a line when guessing urls (bug 1974112) --- test/unit/autolinker_spec.js | 9 +++++++++ web/autolinker.js | 2 +- web/pdf_find_controller.js | 9 ++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/test/unit/autolinker_spec.js b/test/unit/autolinker_spec.js index 2d2e4d673..a7755af67 100644 --- a/test/unit/autolinker_spec.js +++ b/test/unit/autolinker_spec.js @@ -195,4 +195,13 @@ describe("autolinker", function () { ["httptest@email.com", "mailto:httptest@email.com"], ]); }); + + it("shouldn't remove the dash when it's an the end of a line (bug 1974112)", function () { + testLinks([ + [ + "https://github.com/pypi/linehaul-cloud-\nfunction", + "https://github.com/pypi/linehaul-cloud-function", + ], + ]); + }); }); diff --git a/web/autolinker.js b/web/autolinker.js index 957e4de8e..85b7e8e28 100644 --- a/web/autolinker.js +++ b/web/autolinker.js @@ -138,7 +138,7 @@ class Autolinker { this.#regex ??= /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv; - const [normalizedText, diffs] = normalize(text); + const [normalizedText, diffs] = normalize(text, { ignoreDashEOL: true }); const matches = normalizedText.matchAll(this.#regex); const links = []; for (const match of matches) { diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 7a34117e9..01610dd11 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -97,7 +97,7 @@ const NFKC_CHARS_TO_NORMALIZE = new Map(); let noSyllablesRegExp = null; let withSyllablesRegExp = null; -function normalize(text) { +function normalize(text, options = {}) { // The diacritics in the text or in the query can be composed or not. // So we use a decomposed text using NFD (and the same for the query) // in order to be sure that diacritics are in the same order. @@ -118,6 +118,7 @@ function normalize(text) { } const hasSyllables = syllablePositions.length > 0; + const ignoreDashEOL = options.ignoreDashEOL ?? false; let normalizationRegex; if (!hasSyllables && noSyllablesRegExp) { @@ -294,6 +295,12 @@ function normalize(text) { } if (p5) { + if (ignoreDashEOL) { + // Keep the - but remove the EOL. + shiftOrigin += 1; + eol += 1; + return p5.slice(0, -1); + } // In "X-\ny", "-\n" is removed because an hyphen at the end of a line // between two letters is likely here to mark a break in a word. // If X is encoded with UTF-32 then it can have a length greater than 1.