Optionally allow keeping Unicode escape sequences in `stringToPDFString` (PR 17331 follow-up)
Currently *some* of the links[1] on page three of the `issue19835.pdf` test-case aren't clickable, since the destination (of the LinkAnnotation) becomes empty. The reason is that these destinations include the character `\x1b`, which is interpreted as the start of a Unicode escape sequence specifying the language of the string; please refer to section [7.9.2.2 Text String Type](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf#G6.1957385) in the PDF specification.

Hence it seems that we need a way to optionally disable that behaviour, to prevent a "badly" formatted string from becoming empty (or truncated), at least in cases where we are:
- Parsing named destinations[2] and URLs.
- Handling "strings" that are actually /Name-instances.
- Building a lookup Object/Map based on some PDF data-structure.

*NOTE:* The issue that prompted this patch is obviously related to destinations; however, I've gone through the `src/core/` folder and updated various other `stringToPDFString` call-sites that (directly or indirectly) fit the categories listed above.

---
[1] Try clicking on anything on the line containing "Item 7A. Quantitative and Qualitative Disclosures About Market Risk 27".
[2] Unfortunately, just skipping `stringToPDFString` in this case would cause other issues, such as the named destination becoming "unusable" in the viewer; see e.g. issues 14847 and 14864.
This commit is contained in:
parent
254431df1e
commit
b629bafd1c
@ -76,7 +76,7 @@ function fetchRemoteDest(action) {
|
||||
dest = dest.name;
|
||||
}
|
||||
if (typeof dest === "string") {
|
||||
return stringToPDFString(dest);
|
||||
return stringToPDFString(dest, /* keepEscapeSequence = */ true);
|
||||
} else if (isValidExplicitDest(dest)) {
|
||||
return JSON.stringify(dest);
|
||||
}
|
||||
@ -674,7 +674,8 @@ class Catalog {
|
||||
for (const [key, value] of obj.getAll()) {
|
||||
const dest = fetchDest(value);
|
||||
if (dest) {
|
||||
dests[stringToPDFString(key)] = dest;
|
||||
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
|
||||
dest;
|
||||
}
|
||||
}
|
||||
} else if (obj instanceof Dict) {
|
||||
@ -682,7 +683,8 @@ class Catalog {
|
||||
const dest = fetchDest(value);
|
||||
if (dest) {
|
||||
// Always let the NameTree take precedence.
|
||||
dests[stringToPDFString(key)] ||= dest;
|
||||
dests[stringToPDFString(key, /* keepEscapeSequence = */ true)] ||=
|
||||
dest;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1046,7 +1048,8 @@ class Catalog {
|
||||
for (const [key, value] of nameTree.getAll()) {
|
||||
const fs = new FileSpec(value, this.xref);
|
||||
attachments ??= Object.create(null);
|
||||
attachments[stringToPDFString(key)] = fs.serializable;
|
||||
attachments[stringToPDFString(key, /* keepEscapeSequence = */ true)] =
|
||||
fs.serializable;
|
||||
}
|
||||
}
|
||||
return shadow(this, "attachments", attachments);
|
||||
@ -1060,7 +1063,10 @@ class Catalog {
|
||||
const nameTree = new NameTree(obj.getRaw("XFAImages"), this.xref);
|
||||
for (const [key, value] of nameTree.getAll()) {
|
||||
xfaImages ??= new Dict(this.xref);
|
||||
xfaImages.set(stringToPDFString(key), value);
|
||||
xfaImages.set(
|
||||
stringToPDFString(key, /* keepEscapeSequence = */ true),
|
||||
value
|
||||
);
|
||||
}
|
||||
}
|
||||
return shadow(this, "xfaImages", xfaImages);
|
||||
@ -1084,7 +1090,10 @@ class Catalog {
|
||||
} else if (typeof js !== "string") {
|
||||
return;
|
||||
}
|
||||
js = stringToPDFString(js).replaceAll("\x00", "");
|
||||
js = stringToPDFString(js, /* keepEscapeSequence = */ true).replaceAll(
|
||||
"\x00",
|
||||
""
|
||||
);
|
||||
// Skip empty entries, similar to the `_collectJS` function.
|
||||
if (js) {
|
||||
(javaScript ||= new Map()).set(name, js);
|
||||
@ -1094,7 +1103,10 @@ class Catalog {
|
||||
if (obj instanceof Dict && obj.has("JavaScript")) {
|
||||
const nameTree = new NameTree(obj.getRaw("JavaScript"), this.xref);
|
||||
for (const [key, value] of nameTree.getAll()) {
|
||||
appendIfJavaScriptDict(stringToPDFString(key), value);
|
||||
appendIfJavaScriptDict(
|
||||
stringToPDFString(key, /* keepEscapeSequence = */ true),
|
||||
value
|
||||
);
|
||||
}
|
||||
}
|
||||
// Append OpenAction "JavaScript" actions, if any, to the JavaScript map.
|
||||
@ -1633,7 +1645,10 @@ class Catalog {
|
||||
const name = target.get("N");
|
||||
|
||||
if (isName(relationship, "C") && typeof name === "string") {
|
||||
attachment = docAttachments[stringToPDFString(name)];
|
||||
attachment =
|
||||
docAttachments[
|
||||
stringToPDFString(name, /* keepEscapeSequence = */ true)
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1699,7 +1714,11 @@ class Catalog {
|
||||
js = jsAction;
|
||||
}
|
||||
|
||||
const jsURL = js && recoverJsURL(stringToPDFString(js));
|
||||
const jsURL =
|
||||
js &&
|
||||
recoverJsURL(
|
||||
stringToPDFString(js, /* keepEscapeSequence = */ true)
|
||||
);
|
||||
if (jsURL) {
|
||||
url = jsURL.url;
|
||||
resultObj.newWindow = jsURL.newWindow;
|
||||
@ -1735,7 +1754,10 @@ class Catalog {
|
||||
dest = dest.name;
|
||||
}
|
||||
if (typeof dest === "string") {
|
||||
resultObj.dest = stringToPDFString(dest);
|
||||
resultObj.dest = stringToPDFString(
|
||||
dest,
|
||||
/* keepEscapeSequence = */ true
|
||||
);
|
||||
} else if (isValidExplicitDest(dest)) {
|
||||
resultObj.dest = dest;
|
||||
}
|
||||
|
||||
@ -424,7 +424,10 @@ function _collectJS(entry, xref, list, parents) {
|
||||
} else if (typeof js === "string") {
|
||||
code = js;
|
||||
}
|
||||
code &&= stringToPDFString(code).replaceAll("\x00", "");
|
||||
code &&= stringToPDFString(
|
||||
code,
|
||||
/* keepEscapeSequence = */ true
|
||||
).replaceAll("\x00", "");
|
||||
if (code) {
|
||||
list.push(code);
|
||||
}
|
||||
|
||||
@ -77,7 +77,7 @@ class FileSpec {
|
||||
|
||||
const item = pickPlatformItem(this.root);
|
||||
if (item && typeof item === "string") {
|
||||
filename = stringToPDFString(item)
|
||||
filename = stringToPDFString(item, /* keepEscapeSequence = */ true)
|
||||
.replaceAll("\\\\", "\\")
|
||||
.replaceAll("\\/", "/")
|
||||
.replaceAll("\\", "/");
|
||||
|
||||
@ -1022,9 +1022,9 @@ const PDFStringTranslateTable = [
|
||||
0x131, 0x142, 0x153, 0x161, 0x17e, 0, 0x20ac,
|
||||
];
|
||||
|
||||
function stringToPDFString(str) {
|
||||
function stringToPDFString(str, keepEscapeSequence = false) {
|
||||
// See section 7.9.2.2 Text String Type.
|
||||
// The string can contain some language codes bracketed with 0x0b,
|
||||
// The string can contain some language codes bracketed with 0x1b,
|
||||
// so we must remove them.
|
||||
if (str[0] >= "\xEF") {
|
||||
let encoding;
|
||||
@ -1047,7 +1047,7 @@ function stringToPDFString(str) {
|
||||
const decoder = new TextDecoder(encoding, { fatal: true });
|
||||
const buffer = stringToBytes(str);
|
||||
const decoded = decoder.decode(buffer);
|
||||
if (!decoded.includes("\x1b")) {
|
||||
if (keepEscapeSequence || !decoded.includes("\x1b")) {
|
||||
return decoded;
|
||||
}
|
||||
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
|
||||
@ -1060,7 +1060,7 @@ function stringToPDFString(str) {
|
||||
const strBuf = [];
|
||||
for (let i = 0, ii = str.length; i < ii; i++) {
|
||||
const charCode = str.charCodeAt(i);
|
||||
if (charCode === 0x1b) {
|
||||
if (!keepEscapeSequence && charCode === 0x1b) {
|
||||
// eslint-disable-next-line no-empty
|
||||
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
|
||||
continue;
|
||||
|
||||
@ -1421,6 +1421,28 @@ describe("api", function () {
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets a destination containing Unicode escape sequence (\x1b), from /Dests dictionary with keys using PDFDocEncoding", async function () {
|
||||
if (isNodeJS) {
|
||||
pending("Linked test-cases are not supported in Node.js.");
|
||||
}
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue19835.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
|
||||
const page3 = await pdfDoc.getPage(3);
|
||||
const annots = await page3.getAnnotations();
|
||||
|
||||
const annot = annots.find(x => x.id === "55R");
|
||||
// Sanity check to make sure that we found the "correct" annotation.
|
||||
expect(annot.dest).toEqual(
|
||||
"\u02d9\u0064\u002a\u0010\u000e\u0061\u00d6\u0002\u005b\u00b7\u201a\u0022\u00c5\u00da\u017e\u00bb\u00d5\u0062\u02dd\u00d1"
|
||||
);
|
||||
|
||||
const dest = await pdfDoc.getDestination(annot.dest);
|
||||
expect(dest).toEqual([28, { name: "XYZ" }, 34.0799999, 73.5199999, 0]);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets non-string destination", async function () {
|
||||
let numberPromise = pdfDocument.getDestination(4.3);
|
||||
let booleanPromise = pdfDocument.getDestination(true);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user