Preserve the page labels when extracting pages from a single PDF file.

Review amendments (all of them in src/core/editor/pdf_editor.js):
 - #collectPageLabels is synchronous: it never awaits anything and it's
   called without `await`, hence an exception would have been turned into
   an unhandled promise rejection.
 - #collectPageLabels bails out when the first kept page has no label,
   instead of writing a null/undefined entry for page 0.
 - #makeNameNumTree no longer sets /Limits on the root node (only the
   intermediate and leaf nodes are allowed to have one), no longer sorts
   the caller's array in place, and orders the name keys by code unit
   rather than with the locale-dependent `localeCompare`.
 - #collectDocumentData doesn't mix `await` and `.then()` anymore.
The "index" line of pdf_editor.js still carries the blob ids of the
original patch.

diff --git a/src/core/catalog.js b/src/core/catalog.js
index e5946c50f..5b5b319c7 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -735,6 +735,16 @@ class Catalog {
     return rawDests;
   }
 
+  get rawPageLabels() {
+    const obj = this.#catDict.getRaw("PageLabels");
+    if (!obj) {
+      return null;
+    }
+
+    const numberTree = new NumberTree(obj, this.xref);
+    return numberTree.getAll();
+  }
+
   get pageLabels() {
     let obj = null;
     try {
@@ -749,8 +759,8 @@ class Catalog {
   }
 
   #readPageLabels() {
-    const obj = this.#catDict.getRaw("PageLabels");
-    if (!obj) {
+    const nums = this.rawPageLabels;
+    if (!nums) {
       return null;
     }
 
@@ -758,8 +768,6 @@ class Catalog {
     let style = null,
       prefix = "";
 
-    const numberTree = new NumberTree(obj, this.xref);
-    const nums = numberTree.getAll();
     let currentLabel = "",
       currentIndex = 1;
 
diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js
index 7df909156..5241863a7 100644
--- a/src/core/editor/pdf_editor.js
+++ b/src/core/editor/pdf_editor.js
@@ -25,6 +25,7 @@ import { StringStream } from "../stream.js";
 import { stringToAsciiOrUTF16BE } from "../core_utils.js";
 
 const MAX_LEAVES_PER_PAGES_NODE = 16;
+const MAX_IN_NAME_TREE_NODE = 64;
 
 class PageData {
   constructor(page, documentData) {
@@ -39,6 +40,7 @@ class PageData {
 class DocumentData {
   constructor(document) {
     this.document = document;
+    this.pageLabels = null;
     this.pagesMap = new RefSetCache();
     this.oldRefMapping = new RefSetCache();
   }
@@ -61,6 +63,7 @@ class PDFEditor {
     this.version = "1.7";
     this.title = title;
     this.author = author;
+    this.pageLabels = null;
   }
 
   /**
@@ -253,6 +256,8 @@ class PDFEditor {
     await Promise.all(promises);
     promises.length = 0;
 
+    this.#collectPageLabels();
+
     for (const page of this.oldPages) {
       promises.push(this.#postCollectPageData(page));
     }
@@ -270,7 +275,12 @@ class PDFEditor {
    * @param {DocumentData} documentData
    * @return {Promise}
    */
-  async #collectDocumentData(documentData) {}
+  async #collectDocumentData(documentData) {
+    const { document } = documentData;
+    // Keep the raw page labels in order to be able to copy them later on.
+    documentData.pageLabels =
+      await document.pdfManager.ensureCatalog("rawPageLabels");
+  }
 
   /**
    * Post process the collected page data.
@@ -306,6 +316,66 @@ class PDFEditor {
     pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
   }
 
+  /**
+   * Compute the page labels of the new document from the ones of the
+   * original document, for the pages which are kept.
+   */
+  #collectPageLabels() {
+    // We can only preserve page labels when editing a single PDF file.
+    // This is consistent with behavior in Adobe Acrobat.
+    if (!this.hasSingleFile) {
+      return;
+    }
+    const {
+      documentData: { document, pageLabels },
+    } = this.oldPages[0];
+    if (!pageLabels) {
+      return;
+    }
+    const numPages = document.numPages;
+    const oldPageLabels = [];
+    const oldPageIndices = new Set(
+      this.oldPages.map(({ page: { pageIndex } }) => pageIndex)
+    );
+    let currentLabel = null;
+    let stFirstIndex = -1;
+    for (let i = 0; i < numPages; i++) {
+      const newLabel = pageLabels.get(i);
+      if (newLabel) {
+        currentLabel = newLabel;
+        stFirstIndex = currentLabel.has("St") ? i : -1;
+      }
+      if (!oldPageIndices.has(i)) {
+        continue;
+      }
+      if (stFirstIndex !== -1) {
+        // The range has an explicit start value: offset it by the number of
+        // pages skipped since the beginning of the range.
+        const st = currentLabel.get("St");
+        currentLabel = currentLabel.clone();
+        currentLabel.set("St", st + (i - stFirstIndex));
+        stFirstIndex = -1;
+      }
+      oldPageLabels.push(currentLabel);
+    }
+    currentLabel = oldPageLabels[0];
+    if (!currentLabel) {
+      // Either no page is kept or the first kept page has no label in the
+      // original document: don't write a tree with a missing first entry.
+      return;
+    }
+    // Only add an entry when the label differs from the previous page one.
+    const newPageLabels = (this.pageLabels = [[0, currentLabel]]);
+    for (let i = 1, ii = oldPageLabels.length; i < ii; i++) {
+      const label = oldPageLabels[i];
+      if (label === currentLabel) {
+        continue;
+      }
+      currentLabel = label;
+      newPageLabels.push([i, label]);
+    }
+  }
+
   /**
    * Create a copy of a page.
    * @param {number} pageIndex
@@ -423,6 +493,77 @@ class PDFEditor {
     }
   }
 
+  /**
+   * Create a name or number tree from the given map.
+   * The root node never gets a /Limits entry: only intermediate and leaf
+   * nodes are allowed to have one.
+   * @param {Array<[string|number, any]>} map - The [key, value] pairs; this
+   *   array isn't modified.
+   * @param {boolean} areNames - True to create a name tree (string keys),
+   *   false to create a number tree (numeric keys).
+   * @returns {Ref}
+   */
+  #makeNameNumTree(map, areNames) {
+    // Sort a copy in order to leave the caller's array untouched.
+    // The keys of a name tree are compared by code unit instead of using
+    // `localeCompare`, whose result depends on the current locale.
+    const allEntries = map.slice().sort(
+      areNames
+        ? ([keyA], [keyB]) => {
+            if (keyA < keyB) {
+              return -1;
+            }
+            return keyA > keyB ? 1 : 0;
+          }
+        : ([keyA], [keyB]) => keyA - keyB
+    );
+    const maxLeaves =
+      MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE;
+    const [treeRef, treeDict] = this.newDict;
+    const stack = [{ dict: treeDict, entries: allEntries }];
+    const valueType = areNames ? "Names" : "Nums";
+
+    while (stack.length > 0) {
+      const { dict, entries } = stack.pop();
+      if (entries.length <= maxLeaves) {
+        // Either a leaf (its limits were set by its parent) or a root which
+        // is small enough to hold all the entries.
+        dict.set(valueType, entries.flat());
+        continue;
+      }
+      const chunkSize = Math.max(
+        maxLeaves,
+        Math.ceil(entries.length / maxLeaves)
+      );
+      const kidsRefs = [];
+      dict.set("Kids", kidsRefs);
+      for (let i = 0, ii = entries.length; i < ii; i += chunkSize) {
+        const chunk = entries.slice(i, i + chunkSize);
+        const [kidRef, kidDict] = this.newDict;
+        kidsRefs.push(kidRef);
+        kidDict.set("Limits", [chunk[0][0], chunk.at(-1)[0]]);
+        stack.push({ dict: kidDict, entries: chunk });
+      }
+    }
+    return treeRef;
+  }
+
+  /**
+   * Create the page labels tree if it exists.
+   */
+  #makePageLabelsTree() {
+    const { pageLabels } = this;
+    if (!pageLabels || pageLabels.length === 0) {
+      return;
+    }
+    const { rootDict } = this;
+    const pageLabelsRef = this.#makeNameNumTree(
+      pageLabels,
+      /* areNames = */ false
+    );
+    rootDict.set("PageLabels", pageLabelsRef);
+  }
+
   /**
    * Create the root dictionary.
    * @returns {Promise}
@@ -432,6 +573,7 @@ class PDFEditor {
     rootDict.setIfName("Type", "Catalog");
     rootDict.set("Version", this.version);
     this.#makePageTree();
+    this.#makePageLabelsTree();
   }
 
   /**
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 91091a44e..9fc4acaf5 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -757,3 +757,4 @@
 !doc_1_3_pages.pdf
 !doc_2_3_pages.pdf
 !doc_3_3_pages.pdf
+!labelled_pages.pdf
diff --git a/test/pdfs/labelled_pages.pdf b/test/pdfs/labelled_pages.pdf
new file mode 100755
index 000000000..68e389f40
Binary files /dev/null and b/test/pdfs/labelled_pages.pdf differ
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 0729875b6..892166732 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -5542,5 +5542,39 @@ small scripts as well as for`);
         await loadingTask.destroy();
       });
     });
+
+    describe("Page labels", function () {
+      it("extract page and check labels", async function () {
+        let loadingTask = getDocument(
+          buildGetDocumentParams("labelled_pages.pdf")
+        );
+        const pdfDoc = await loadingTask.promise;
+        let labels = await pdfDoc.getPageLabels();
+        expect(labels).toEqual([
+          "i" /* Page 0 */,
+          "ii" /* Page 1 */,
+          "iii" /* Page 2 */,
+          "iv" /* Page 3 */,
+          "1" /* Page 4 */,
+          "2" /* Page 5 */,
+          "3" /* Page 6 */,
+          "a" /* Page 7 */,
+          "b" /* Page 8 */,
+          "4" /* Page 9 */,
+          "5" /* Page 10 */,
+        ]);
+
+        const data = await pdfDoc.extractPages({
+          document: null,
+          includePages: [0, 1, 5, 7, 10],
+        });
+        await loadingTask.destroy();
+        loadingTask = getDocument(data);
+        const newPdfDoc = await loadingTask.promise;
+        labels = await newPdfDoc.getPageLabels();
+        expect(labels).toEqual(["i", "ii", "1", "a", "5"]);
+        await loadingTask.destroy();
+      });
+    });
   });
 });