Update the page labels tree when a pdf is extracted (bug 1997379)

This commit is contained in:
Calixte Denizet 2025-11-07 15:34:08 +01:00
parent 85ed401b82
commit ad97c5b816
5 changed files with 169 additions and 5 deletions

View File

@ -735,6 +735,16 @@ class Catalog {
return rawDests;
}
get rawPageLabels() {
const obj = this.#catDict.getRaw("PageLabels");
if (!obj) {
return null;
}
const numberTree = new NumberTree(obj, this.xref);
return numberTree.getAll();
}
get pageLabels() {
let obj = null;
try {
@ -749,8 +759,8 @@ class Catalog {
}
#readPageLabels() {
const obj = this.#catDict.getRaw("PageLabels");
if (!obj) {
const nums = this.rawPageLabels;
if (!nums) {
return null;
}
@ -758,8 +768,6 @@ class Catalog {
let style = null,
prefix = "";
const numberTree = new NumberTree(obj, this.xref);
const nums = numberTree.getAll();
let currentLabel = "",
currentIndex = 1;

View File

@ -25,6 +25,7 @@ import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";
const MAX_LEAVES_PER_PAGES_NODE = 16;
const MAX_IN_NAME_TREE_NODE = 64;
class PageData {
constructor(page, documentData) {
@ -39,6 +40,7 @@ class PageData {
class DocumentData {
/**
 * Set up the per-document state used while editing; everything except the
 * document handle starts out empty.
 * @param {Object} document - The source document. Elsewhere in this file its
 *   `pdfManager` and `numPages` properties are read.
 */
constructor(document) {
this.document = document;
// Stays null until PDFEditor's #collectDocumentData stores the catalog's
// "rawPageLabels" value here (which may itself be null).
this.pageLabels = null;
// NOTE(review): presumably keyed by page reference and filled while the
// pages are collected - confirm against the code that populates it.
this.pagesMap = new RefSetCache();
// NOTE(review): presumably maps references from the source document to
// their counterparts in the new document - confirm against its users.
this.oldRefMapping = new RefSetCache();
}
@ -61,6 +63,7 @@ class PDFEditor {
this.version = "1.7";
this.title = title;
this.author = author;
this.pageLabels = null;
}
/**
@ -253,6 +256,8 @@ class PDFEditor {
await Promise.all(promises);
promises.length = 0;
this.#collectPageLabels();
for (const page of this.oldPages) {
promises.push(this.#postCollectPageData(page));
}
@ -270,7 +275,12 @@ class PDFEditor {
* @param {DocumentData} documentData
* @return {Promise<void>}
*/
async #collectDocumentData(documentData) {}
async #collectDocumentData(documentData) {
const { document } = documentData;
await document.pdfManager
.ensureCatalog("rawPageLabels")
.then(pageLabels => (documentData.pageLabels = pageLabels));
}
/**
* Post process the collected page data.
@ -306,6 +316,56 @@ class PDFEditor {
pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
}
async #collectPageLabels() {
// We can only preserve page labels when editing a single PDF file.
// This is consistent with behavior in Adobe Acrobat.
if (!this.hasSingleFile) {
return;
}
const {
documentData: { document, pageLabels },
} = this.oldPages[0];
if (!pageLabels) {
return;
}
const numPages = document.numPages;
const oldPageLabels = [];
const oldPageIndices = new Set(
this.oldPages.map(({ page: { pageIndex } }) => pageIndex)
);
let currentLabel = null;
let stFirstIndex = -1;
for (let i = 0; i < numPages; i++) {
const newLabel = pageLabels.get(i);
if (newLabel) {
currentLabel = newLabel;
stFirstIndex = currentLabel.has("St") ? i : -1;
}
if (!oldPageIndices.has(i)) {
continue;
}
if (stFirstIndex !== -1) {
const st = currentLabel.get("St");
currentLabel = currentLabel.clone();
currentLabel.set("St", st + (i - stFirstIndex));
stFirstIndex = -1;
}
oldPageLabels.push(currentLabel);
}
currentLabel = oldPageLabels[0];
let currentIndex = 0;
const newPageLabels = (this.pageLabels = [[0, currentLabel]]);
for (let i = 0, ii = oldPageLabels.length; i < ii; i++) {
const label = oldPageLabels[i];
if (label === currentLabel) {
continue;
}
currentIndex = i;
currentLabel = label;
newPageLabels.push([currentIndex, currentLabel]);
}
}
/**
* Create a copy of a page.
* @param {number} pageIndex
@ -423,6 +483,66 @@ class PDFEditor {
}
}
/**
* Create a name or number tree from the given map.
* @param {Array<[string, any]>} map
* @returns {Ref}
*/
#makeNameNumTree(map, areNames) {
const allEntries = map.sort(
areNames
? ([keyA], [keyB]) => keyA.localeCompare(keyB)
: ([keyA], [keyB]) => keyA - keyB
);
const maxLeaves =
MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE;
const [treeRef, treeDict] = this.newDict;
const stack = [{ dict: treeDict, entries: allEntries }];
const valueType = areNames ? "Names" : "Nums";
while (stack.length > 0) {
const { dict, entries } = stack.pop();
if (entries.length <= maxLeaves) {
dict.set("Limits", [entries[0][0], entries.at(-1)[0]]);
dict.set(valueType, entries.flat());
continue;
}
const entriesChunks = [];
const chunkSize = Math.max(
maxLeaves,
Math.ceil(entries.length / maxLeaves)
);
for (let i = 0; i < entries.length; i += chunkSize) {
entriesChunks.push(entries.slice(i, i + chunkSize));
}
const entriesRefs = [];
dict.set("Kids", entriesRefs);
for (const chunk of entriesChunks) {
const [entriesRef, entriesDict] = this.newDict;
entriesRefs.push(entriesRef);
entriesDict.set("Limits", [chunk[0][0], chunk.at(-1)[0]]);
stack.push({ dict: entriesDict, entries: chunk });
}
}
return treeRef;
}
/**
* Create the page labels tree if it exists.
*/
#makePageLabelsTree() {
const { pageLabels } = this;
if (!pageLabels || pageLabels.length === 0) {
return;
}
const { rootDict } = this;
const pageLabelsRef = this.#makeNameNumTree(
this.pageLabels,
/* areNames = */ false
);
rootDict.set("PageLabels", pageLabelsRef);
}
/**
* Create the root dictionary.
* @returns {Promise<void>}
@ -432,6 +552,7 @@ class PDFEditor {
rootDict.setIfName("Type", "Catalog");
rootDict.set("Version", this.version);
this.#makePageTree();
this.#makePageLabelsTree();
}
/**

View File

@ -757,3 +757,4 @@
!doc_1_3_pages.pdf
!doc_2_3_pages.pdf
!doc_3_3_pages.pdf
!labelled_pages.pdf

BIN
test/pdfs/labelled_pages.pdf Executable file

Binary file not shown.

View File

@ -5542,5 +5542,39 @@ small scripts as well as for`);
await loadingTask.destroy();
});
});
// Extracting a subset of pages must carry over the labels of those pages.
describe("Page labels", function () {
  it("extract page and check labels", async function () {
    // Check the labels of the source document first.
    const sourceTask = getDocument(
      buildGetDocumentParams("labelled_pages.pdf")
    );
    const sourceDoc = await sourceTask.promise;
    const sourceLabels = await sourceDoc.getPageLabels();
    expect(sourceLabels).toEqual([
      "i" /* Page 0 */,
      "ii" /* Page 1 */,
      "iii" /* Page 2 */,
      "iv" /* Page 3 */,
      "1" /* Page 4 */,
      "2" /* Page 5 */,
      "3" /* Page 6 */,
      "a" /* Page 7 */,
      "b" /* Page 8 */,
      "4" /* Page 9 */,
      "5" /* Page 10 */,
    ]);

    // Keep five of the pages, then reload the result and check its labels.
    const extractedData = await sourceDoc.extractPages({
      document: null,
      includePages: [0, 1, 5, 7, 10],
    });
    await sourceTask.destroy();

    const extractedTask = getDocument(extractedData);
    const extractedDoc = await extractedTask.promise;
    const extractedLabels = await extractedDoc.getPageLabels();
    expect(extractedLabels).toEqual(["i", "ii", "1", "a", "5"]);
    await extractedTask.destroy();
  });
});
});
});