Optionally allow keeping Unicode escape sequences in `stringToPDFString` (PR 17331 follow-up)

Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty.
The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least for cases where we are:
 - Parsing named destinations[2] and URLs.
 - Handling "strings" that are actually /Name-instances.
 - Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations, however I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---

[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".

[2] Unfortunately just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
2025-04-30 15:43:00 +02:00 · 2025-04-30 15:43:00 +02:00 · b629bafd1c
commit b629bafd1c
parent 254431df1e
5 changed files with 63 additions and 16 deletions
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
      dest = dest.name;
    }
    if (typeof dest === "string") {
-      return stringToPDFString(dest);
+      return stringToPDFString(dest, /* keepEscapeSequence = */ true);
    } else if (isValidExplicitDest(dest)) {
      return JSON.stringify(dest);
    }
@ -674,7 +674,8 @@ class Catalog {
        for (const [key, value] of obj.getAll()) {
          const dest = fetchDest(value);
          if (dest) {
-            dests[stringToPDFString(key)] = dest;
+            dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
              dest;
          }
        }
      } else if (obj instanceof Dict) {
@ -682,7 +683,8 @@ class Catalog {
          const dest = fetchDest(value);
          if (dest) {
            // Always let the NameTree take precedence.
-            dests[stringToPDFString(key)] ||= dest;
+            dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
              dest;
          }
        }
      }
@ -1046,7 +1048,8 @@ class Catalog {
      for (const [key, value] of nameTree.getAll()) {
        const fs = new FileSpec(value, this.xref);
        attachments ??= Object.create(null);
-        attachments[stringToPDFString(key)] = fs.serializable;
+        attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
          fs.serializable;
      }
    }
    return shadow(this, "attachments", attachments);
@ -1060,7 +1063,10 @@ class Catalog {
      const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
      for (const [key, value] of nameTree.getAll()) {
        xfaImages ??= new Dict(this.xref);
-        xfaImages.set(stringToPDFString(key), value);
+        xfaImages.set(
          stringToPDFString(key, /* keepEscapeSequence = */ true),
          value
        );
      }
    }
    return shadow(this, "xfaImages", xfaImages);
@ -1084,7 +1090,10 @@ class Catalog {
      } else if (typeof js !== "string") {
        return;
      }
-      js = stringToPDFString(js).replaceAll("\x00", "");
+      js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
        "\x00",
        ""
      );
      // Skip empty entries, similar to the `_collectJS` function.
      if (js) {
        (javaScript ||= new Map()).set(name, js);
@ -1094,7 +1103,10 @@ class Catalog {
    if (obj instanceof Dict && obj.has("JavaScript")) {
      const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
      for (const [key, value] of nameTree.getAll()) {
-        appendIfJavaScriptDict(stringToPDFString(key), value);
+        appendIfJavaScriptDict(
          stringToPDFString(key, /* keepEscapeSequence = */ true),
          value
        );
      }
    }
    // Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
@ -1633,7 +1645,10 @@ class Catalog {
            const name = target.get("N");
            if (isName(relationship, "C") && typeof name === "string") {
-              attachment = docAttachments[stringToPDFString(name)];
+              attachment =
                docAttachments[
                  stringToPDFString(name, /* keepEscapeSequence = */ true)
                ];
            }
          }
@ -1699,7 +1714,11 @@ class Catalog {
            js = jsAction;
          }
-          const jsURL = js && recoverJsURL(stringToPDFString(js));
+          const jsURL =
            js &&
            recoverJsURL(
              stringToPDFString(js, /* keepEscapeSequence = */ true)
            );
          if (jsURL) {
            url = jsURL.url;
            resultObj.newWindow = jsURL.newWindow;
@ -1735,7 +1754,10 @@ class Catalog {
        dest = dest.name;
      }
      if (typeof dest === "string") {
-        resultObj.dest = stringToPDFString(dest);
+        resultObj.dest = stringToPDFString(
          dest,
          /* keepEscapeSequence = */ true
        );
      } else if (isValidExplicitDest(dest)) {
        resultObj.dest = dest;
      }
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
      } else if (typeof js === "string") {
        code = js;
      }
-      code &&= stringToPDFString(code).replaceAll("\x00", "");
+      code &&= stringToPDFString(
        code,
        /* keepEscapeSequence = */ true
      ).replaceAll("\x00", "");
      if (code) {
        list.push(code);
      }
--- a/src/core/file_spec.js
+++ b/src/core/file_spec.js
@ -77,7 +77,7 @@ class FileSpec {
    const item = pickPlatformItem(this.root);
    if (item && typeof item === "string") {
-      filename = stringToPDFString(item)
+      filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
        .replaceAll("\\\\", "\\")
        .replaceAll("\\/", "/")
        .replaceAll("\\", "/");
--- a/src/shared/util.js
+++ b/src/shared/util.js
@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
  0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
 ];
-function stringToPDFString(str) {
+function stringToPDFString(str, keepEscapeSequence = false) {
  // See section 7.9.2.2 Text String Type.
-  // The string can contain some language codes bracketed with 0x0b,
+  // The string can contain some language codes bracketed with 0x1b,
  // so we must remove them.
  if (str[0] >= "\xEF") {
    let encoding;
@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
        const decoder = new TextDecoder(encoding, { fatal: true });
        const buffer = stringToBytes(str);
        const decoded = decoder.decode(buffer);
-        if (!decoded.includes("\x1b")) {
+        if (keepEscapeSequence || !decoded.includes("\x1b")) {
          return decoded;
        }
        return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
  const strBuf = [];
  for (let i = 0, ii = str.length; i < ii; i++) {
    const charCode = str.charCodeAt(i);
-    if (charCode === 0x1b) {
+    if (!keepEscapeSequence && charCode === 0x1b) {
      // eslint-disable-next-line no-empty
      while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
      continue;
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -1421,6 +1421,28 @@ describe("api", function () {
      await loadingTask.destroy();
    });
    // Regression test for issue 19835: a named destination whose raw string
    // contains the byte 0x1b must not be emptied (or truncated) by treating
    // that byte as the start of a language escape sequence; otherwise the
    // link annotation ends up without a usable destination.
    // NOTE: The backslash in the spec title is escaped on purpose, so that the
    // reported name contains the literal text `\x1b` rather than a raw ESC
    // control character.
    it("gets a destination containing Unicode escape sequence (\\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
      if (isNodeJS) {
        // The PDF document is a linked test-case, hence the test is skipped
        // in Node.js environments.
        pending("Linked test-cases are not supported in Node.js.");
      }
      const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
      const pdfDoc = await loadingTask.promise;
      const page3 = await pdfDoc.getPage(3);
      const annots = await page3.getAnnotations();

      // Look up the specific link annotation, by its id, whose destination
      // is checked below.
      const annot = annots.find(x => x.id === "55R");
      // Sanity check to make sure that we found the "correct" annotation.
      expect(annot.dest).toEqual(
        "\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
      );

      // The destination name reported by the annotation must resolve to an
      // explicit destination through the document-level lookup.
      const dest = await pdfDoc.getDestination(annot.dest);
      expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);

      await loadingTask.destroy();
    });
    it("gets non-string destination", async function () {
      let numberPromise = pdfDocument.getDestination(4.3);
      let booleanPromise = pdfDocument.getDestination(true);