Optionally allow keeping Unicode escape sequences in stringToPDFString (PR 17331 follow-up)

Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty.
The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least for cases where we are:
 - Parsing named destinations[2] and URLs.
 - Handling "strings" that are actually /Name-instances.
 - Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations; however, I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---
[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".

[2] Unfortunately just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
This commit is contained in:
Jonas Jenwald 2025-04-30 15:43:00 +02:00
parent 254431df1e
commit b629bafd1c
5 changed files with 63 additions and 16 deletions

View File

@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
dest = dest.name;
}
if (typeof dest === "string") {
return stringToPDFString(dest);
return stringToPDFString(dest, /* keepEscapeSequence = */ true);
} else if (isValidExplicitDest(dest)) {
return JSON.stringify(dest);
}
@ -674,7 +674,8 @@ class Catalog {
for (const [key, value] of obj.getAll()) {
const dest = fetchDest(value);
if (dest) {
dests[stringToPDFString(key)] = dest;
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
dest;
}
}
} else if (obj instanceof Dict) {
@ -682,7 +683,8 @@ class Catalog {
const dest = fetchDest(value);
if (dest) {
// Always let the NameTree take precedence.
dests[stringToPDFString(key)] ||= dest;
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
dest;
}
}
}
@ -1046,7 +1048,8 @@ class Catalog {
for (const [key, value] of nameTree.getAll()) {
const fs = new FileSpec(value, this.xref);
attachments ??= Object.create(null);
attachments[stringToPDFString(key)] = fs.serializable;
attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
fs.serializable;
}
}
return shadow(this, "attachments", attachments);
@ -1060,7 +1063,10 @@ class Catalog {
const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
for (const [key, value] of nameTree.getAll()) {
xfaImages ??= new Dict(this.xref);
xfaImages.set(stringToPDFString(key), value);
xfaImages.set(
stringToPDFString(key, /* keepEscapeSequence = */ true),
value
);
}
}
return shadow(this, "xfaImages", xfaImages);
@ -1084,7 +1090,10 @@ class Catalog {
} else if (typeof js !== "string") {
return;
}
js = stringToPDFString(js).replaceAll("\x00", "");
js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
"\x00",
""
);
// Skip empty entries, similar to the `_collectJS` function.
if (js) {
(javaScript ||= new Map()).set(name, js);
@ -1094,7 +1103,10 @@ class Catalog {
if (obj instanceof Dict && obj.has("JavaScript")) {
const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
for (const [key, value] of nameTree.getAll()) {
appendIfJavaScriptDict(stringToPDFString(key), value);
appendIfJavaScriptDict(
stringToPDFString(key, /* keepEscapeSequence = */ true),
value
);
}
}
// Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
@ -1633,7 +1645,10 @@ class Catalog {
const name = target.get("N");
if (isName(relationship, "C") && typeof name === "string") {
attachment = docAttachments[stringToPDFString(name)];
attachment =
docAttachments[
stringToPDFString(name, /* keepEscapeSequence = */ true)
];
}
}
@ -1699,7 +1714,11 @@ class Catalog {
js = jsAction;
}
const jsURL = js && recoverJsURL(stringToPDFString(js));
const jsURL =
js &&
recoverJsURL(
stringToPDFString(js, /* keepEscapeSequence = */ true)
);
if (jsURL) {
url = jsURL.url;
resultObj.newWindow = jsURL.newWindow;
@ -1735,7 +1754,10 @@ class Catalog {
dest = dest.name;
}
if (typeof dest === "string") {
resultObj.dest = stringToPDFString(dest);
resultObj.dest = stringToPDFString(
dest,
/* keepEscapeSequence = */ true
);
} else if (isValidExplicitDest(dest)) {
resultObj.dest = dest;
}

View File

@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
} else if (typeof js === "string") {
code = js;
}
code &&= stringToPDFString(code).replaceAll("\x00", "");
code &&= stringToPDFString(
code,
/* keepEscapeSequence = */ true
).replaceAll("\x00", "");
if (code) {
list.push(code);
}

View File

@ -77,7 +77,7 @@ class FileSpec {
const item = pickPlatformItem(this.root);
if (item && typeof item === "string") {
filename = stringToPDFString(item)
filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
.replaceAll("\\\\", "\\")
.replaceAll("\\/", "/")
.replaceAll("\\", "/");

View File

@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
];
function stringToPDFString(str) {
function stringToPDFString(str, keepEscapeSequence = false) {
// See section 7.9.2.2 Text String Type.
// The string can contain some language codes bracketed with 0x0b,
// The string can contain some language codes bracketed with 0x1b,
// so we must remove them.
if (str[0] >= "\xEF") {
let encoding;
@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
const decoder = new TextDecoder(encoding, { fatal: true });
const buffer = stringToBytes(str);
const decoded = decoder.decode(buffer);
if (!decoded.includes("\x1b")) {
if (keepEscapeSequence || !decoded.includes("\x1b")) {
return decoded;
}
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
const strBuf = [];
for (let i = 0, ii = str.length; i < ii; i++) {
const charCode = str.charCodeAt(i);
if (charCode === 0x1b) {
if (!keepEscapeSequence && charCode === 0x1b) {
// eslint-disable-next-line no-empty
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
continue;

View File

@ -1421,6 +1421,28 @@ describe("api", function () {
await loadingTask.destroy();
});
// Regression test: a named destination whose key contains the escape
// character must still resolve to an explicit destination.
// NOTE: the backslash in the title is escaped on purpose, so that the test
// description shows the literal text "\x1b" instead of embedding a raw ESC
// control character in the reporter output.
it("gets a destination containing Unicode escape sequence (\\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }
  const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
  const pdfDoc = await loadingTask.promise;
  const page3 = await pdfDoc.getPage(3);
  const annots = await page3.getAnnotations();

  const annot = annots.find(x => x.id === "55R");
  // Sanity check to make sure that we found the "correct" annotation.
  expect(annot.dest).toEqual(
    "\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
  );

  // The annotation's destination name must be resolvable through the
  // document-level lookup, yielding the explicit destination array.
  const dest = await pdfDoc.getDestination(annot.dest);
  expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);

  await loadingTask.destroy();
});
it("gets non-string destination", async function () {
let numberPromise = pdfDocument.getDestination(4.3);
let booleanPromise = pdfDocument.getDestination(true);