Allow optionally keeping Unicode escape sequences in stringToPDFString (PR 17331 follow-up)

Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty.
The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least for cases where we are:
 - Parsing named destinations[2] and URLs.
 - Handling "strings" that are actually /Name-instances.
 - Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations; however, I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---
[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".

[2] Unfortunately just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
This commit is contained in:
Jonas Jenwald 2025-04-30 15:43:00 +02:00
parent 254431df1e
commit b629bafd1c
5 changed files with 63 additions and 16 deletions

View File

@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
dest = dest.name; dest = dest.name;
} }
if (typeof dest === "string") { if (typeof dest === "string") {
return stringToPDFString(dest); return stringToPDFString(dest, /* keepEscapeSequence = */ true);
} else if (isValidExplicitDest(dest)) { } else if (isValidExplicitDest(dest)) {
return JSON.stringify(dest); return JSON.stringify(dest);
} }
@ -674,7 +674,8 @@ class Catalog {
for (const [key, value] of obj.getAll()) { for (const [key, value] of obj.getAll()) {
const dest = fetchDest(value); const dest = fetchDest(value);
if (dest) { if (dest) {
dests[stringToPDFString(key)] = dest; dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
dest;
} }
} }
} else if (obj instanceof Dict) { } else if (obj instanceof Dict) {
@ -682,7 +683,8 @@ class Catalog {
const dest = fetchDest(value); const dest = fetchDest(value);
if (dest) { if (dest) {
// Always let the NameTree take precedence. // Always let the NameTree take precedence.
dests[stringToPDFString(key)] ||= dest; dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
dest;
} }
} }
} }
@ -1046,7 +1048,8 @@ class Catalog {
for (const [key, value] of nameTree.getAll()) { for (const [key, value] of nameTree.getAll()) {
const fs = new FileSpec(value, this.xref); const fs = new FileSpec(value, this.xref);
attachments ??= Object.create(null); attachments ??= Object.create(null);
attachments[stringToPDFString(key)] = fs.serializable; attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
fs.serializable;
} }
} }
return shadow(this, "attachments", attachments); return shadow(this, "attachments", attachments);
@ -1060,7 +1063,10 @@ class Catalog {
const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref); const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
for (const [key, value] of nameTree.getAll()) { for (const [key, value] of nameTree.getAll()) {
xfaImages ??= new Dict(this.xref); xfaImages ??= new Dict(this.xref);
xfaImages.set(stringToPDFString(key), value); xfaImages.set(
stringToPDFString(key, /* keepEscapeSequence = */ true),
value
);
} }
} }
return shadow(this, "xfaImages", xfaImages); return shadow(this, "xfaImages", xfaImages);
@ -1084,7 +1090,10 @@ class Catalog {
} else if (typeof js !== "string") { } else if (typeof js !== "string") {
return; return;
} }
js = stringToPDFString(js).replaceAll("\x00", ""); js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
"\x00",
""
);
// Skip empty entries, similar to the `_collectJS` function. // Skip empty entries, similar to the `_collectJS` function.
if (js) { if (js) {
(javaScript ||= new Map()).set(name, js); (javaScript ||= new Map()).set(name, js);
@ -1094,7 +1103,10 @@ class Catalog {
if (obj instanceof Dict && obj.has("JavaScript")) { if (obj instanceof Dict && obj.has("JavaScript")) {
const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref); const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
for (const [key, value] of nameTree.getAll()) { for (const [key, value] of nameTree.getAll()) {
appendIfJavaScriptDict(stringToPDFString(key), value); appendIfJavaScriptDict(
stringToPDFString(key, /* keepEscapeSequence = */ true),
value
);
} }
} }
// Append OpenAction "JavaScript" actions, if any, to the JavaScript map. // Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
@ -1633,7 +1645,10 @@ class Catalog {
const name = target.get("N"); const name = target.get("N");
if (isName(relationship, "C") && typeof name === "string") { if (isName(relationship, "C") && typeof name === "string") {
attachment = docAttachments[stringToPDFString(name)]; attachment =
docAttachments[
stringToPDFString(name, /* keepEscapeSequence = */ true)
];
} }
} }
@ -1699,7 +1714,11 @@ class Catalog {
js = jsAction; js = jsAction;
} }
const jsURL = js && recoverJsURL(stringToPDFString(js)); const jsURL =
js &&
recoverJsURL(
stringToPDFString(js, /* keepEscapeSequence = */ true)
);
if (jsURL) { if (jsURL) {
url = jsURL.url; url = jsURL.url;
resultObj.newWindow = jsURL.newWindow; resultObj.newWindow = jsURL.newWindow;
@ -1735,7 +1754,10 @@ class Catalog {
dest = dest.name; dest = dest.name;
} }
if (typeof dest === "string") { if (typeof dest === "string") {
resultObj.dest = stringToPDFString(dest); resultObj.dest = stringToPDFString(
dest,
/* keepEscapeSequence = */ true
);
} else if (isValidExplicitDest(dest)) { } else if (isValidExplicitDest(dest)) {
resultObj.dest = dest; resultObj.dest = dest;
} }

View File

@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
} else if (typeof js === "string") { } else if (typeof js === "string") {
code = js; code = js;
} }
code &&= stringToPDFString(code).replaceAll("\x00", ""); code &&= stringToPDFString(
code,
/* keepEscapeSequence = */ true
).replaceAll("\x00", "");
if (code) { if (code) {
list.push(code); list.push(code);
} }

View File

@ -77,7 +77,7 @@ class FileSpec {
const item = pickPlatformItem(this.root); const item = pickPlatformItem(this.root);
if (item && typeof item === "string") { if (item && typeof item === "string") {
filename = stringToPDFString(item) filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
.replaceAll("\\\\", "\\") .replaceAll("\\\\", "\\")
.replaceAll("\\/", "/") .replaceAll("\\/", "/")
.replaceAll("\\", "/"); .replaceAll("\\", "/");

View File

@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac, 0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
]; ];
function stringToPDFString(str) { function stringToPDFString(str, keepEscapeSequence = false) {
// See section 7.9.2.2 Text String Type. // See section 7.9.2.2 Text String Type.
// The string can contain some language codes bracketed with 0x0b, // The string can contain some language codes bracketed with 0x1b,
// so we must remove them. // so we must remove them.
if (str[0] >= "\xEF") { if (str[0] >= "\xEF") {
let encoding; let encoding;
@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
const decoder = new TextDecoder(encoding, { fatal: true }); const decoder = new TextDecoder(encoding, { fatal: true });
const buffer = stringToBytes(str); const buffer = stringToBytes(str);
const decoded = decoder.decode(buffer); const decoded = decoder.decode(buffer);
if (!decoded.includes("\x1b")) { if (keepEscapeSequence || !decoded.includes("\x1b")) {
return decoded; return decoded;
} }
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, ""); return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
const strBuf = []; const strBuf = [];
for (let i = 0, ii = str.length; i < ii; i++) { for (let i = 0, ii = str.length; i < ii; i++) {
const charCode = str.charCodeAt(i); const charCode = str.charCodeAt(i);
if (charCode === 0x1b) { if (!keepEscapeSequence && charCode === 0x1b) {
// eslint-disable-next-line no-empty // eslint-disable-next-line no-empty
while (++i < ii && str.charCodeAt(i) !== 0x1b) {} while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
continue; continue;

View File

@ -1421,6 +1421,28 @@ describe("api", function () {
await loadingTask.destroy(); await loadingTask.destroy();
}); });
// Regression test for `issue19835.pdf`: a LinkAnnotation on page 3 points at a
// named destination whose name must be decoded in full (not emptied), and the
// decoded name must then resolve through `getDestination`.
//
// The backslash in the title is escaped on purpose: an unescaped `\x1b` inside
// a string literal evaluates to a raw ESC control character, which would end
// up in the reported test name instead of the readable text "\x1b".
it("gets a destination containing Unicode escape sequence (\\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }
  const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
  const pdfDoc = await loadingTask.promise;

  const page3 = await pdfDoc.getPage(3);
  const annots = await page3.getAnnotations();

  const annot = annots.find(x => x.id === "55R");
  // Sanity check to make sure that we found the "correct" annotation; the
  // destination name is expected to survive decoding without being emptied.
  expect(annot.dest).toEqual(
    "\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
  );

  // The decoded name must be usable as a lookup key for the destination.
  const dest = await pdfDoc.getDestination(annot.dest);
  expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);

  await loadingTask.destroy();
});
it("gets non-string destination", async function () { it("gets non-string destination", async function () {
let numberPromise = pdfDocument.getDestination(4.3); let numberPromise = pdfDocument.getDestination(4.3);
let booleanPromise = pdfDocument.getDestination(true); let booleanPromise = pdfDocument.getDestination(true);