Merge pull request #20038 from calixteman/bug1974112

Don't remove a dash at the end of a line when guessing URLs (bug 1974112)
2025-06-30 20:43:17 +02:00 · 2025-06-30 20:43:17 +02:00 · f4043b03e2
commit f4043b03e2
parent 85b67f19bc bb6b42177c
3 changed files with 18 additions and 2 deletions
--- a/test/unit/autolinker_spec.js
+++ b/test/unit/autolinker_spec.js
@ -195,4 +195,13 @@ describe("autolinker", function () {
      ["httptest@email.com", "mailto:httptest@email.com"],
    ]);
  });
+
+  it("shouldn't remove the dash when it's an the end of a line (bug 1974112)", function () {
+    testLinks([
+      [
+        "https://github.com/pypi/linehaul-cloud-\nfunction",
+        "https://github.com/pypi/linehaul-cloud-function",
+      ],
+    ]);
+  });
 });
--- a/web/autolinker.js
+++ b/web/autolinker.js
@ -138,7 +138,7 @@ class Autolinker {
    this.#regex ??=
      /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;

-    const [normalizedText, diffs] = normalize(text);
+    const [normalizedText, diffs] = normalize(text, { ignoreDashEOL: true });
    const matches = normalizedText.matchAll(this.#regex);
    const links = [];
    for (const match of matches) {
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -97,7 +97,7 @@ const NFKC_CHARS_TO_NORMALIZE = new Map();
 let noSyllablesRegExp = null;
 let withSyllablesRegExp = null;

-function normalize(text) {
+function normalize(text, options = {}) {
  // The diacritics in the text or in the query can be composed or not.
  // So we use a decomposed text using NFD (and the same for the query)
  // in order to be sure that diacritics are in the same order.
@ -118,6 +118,7 @@ function normalize(text) {
  }

  const hasSyllables = syllablePositions.length > 0;
+  const ignoreDashEOL = options.ignoreDashEOL ?? false;

  let normalizationRegex;
  if (!hasSyllables && noSyllablesRegExp) {
@ -294,6 +295,12 @@ function normalize(text) {
      }

      if (p5) {
+        if (ignoreDashEOL) {
+          // Keep the - but remove the EOL.
+          shiftOrigin += 1;
+          eol += 1;
+          return p5.slice(0, -1);
+        }
        // In "X-\ny", "-\n" is removed because a hyphen at the end of a line
        // between two letters is likely here to mark a break in a word.
        // If X is encoded with UTF-32 then it can have a length greater than 1.