From ad97c5b816b01055711ebfe6018c6adc7d17cdc0 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Fri, 7 Nov 2025 15:34:08 +0100 Subject: [PATCH] Update the page labels tree when a pdf is extracted (bug 1997379) --- src/core/catalog.js | 16 +++-- src/core/editor/pdf_editor.js | 123 +++++++++++++++++++++++++++++++++- test/pdfs/.gitignore | 1 + test/pdfs/labelled_pages.pdf | Bin 0 -> 6717 bytes test/unit/api_spec.js | 34 ++++++++++ 5 files changed, 169 insertions(+), 5 deletions(-) create mode 100755 test/pdfs/labelled_pages.pdf diff --git a/src/core/catalog.js b/src/core/catalog.js index e5946c50f..5b5b319c7 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -735,6 +735,16 @@ class Catalog { return rawDests; } + get rawPageLabels() { + const obj = this.#catDict.getRaw("PageLabels"); + if (!obj) { + return null; + } + + const numberTree = new NumberTree(obj, this.xref); + return numberTree.getAll(); + } + get pageLabels() { let obj = null; try { @@ -749,8 +759,8 @@ class Catalog { } #readPageLabels() { - const obj = this.#catDict.getRaw("PageLabels"); - if (!obj) { + const nums = this.rawPageLabels; + if (!nums) { return null; } @@ -758,8 +768,6 @@ class Catalog { let style = null, prefix = ""; - const numberTree = new NumberTree(obj, this.xref); - const nums = numberTree.getAll(); let currentLabel = "", currentIndex = 1; diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index 7df909156..5241863a7 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -25,6 +25,7 @@ import { StringStream } from "../stream.js"; import { stringToAsciiOrUTF16BE } from "../core_utils.js"; const MAX_LEAVES_PER_PAGES_NODE = 16; +const MAX_IN_NAME_TREE_NODE = 64; class PageData { constructor(page, documentData) { @@ -39,6 +40,7 @@ class PageData { class DocumentData { constructor(document) { this.document = document; + this.pageLabels = null; this.pagesMap = new RefSetCache(); this.oldRefMapping = new RefSetCache(); } @@ -61,6 +63,7 @@ class PDFEditor { this.version = "1.7"; this.title = title; this.author = author; + this.pageLabels = null; } /** @@ -253,6 +256,8 @@ class PDFEditor { await Promise.all(promises); promises.length = 0; + this.#collectPageLabels(); + for (const page of this.oldPages) { promises.push(this.#postCollectPageData(page)); } @@ -270,7 +275,12 @@ class PDFEditor { * @param {DocumentData} documentData * @return {Promise} */ - async #collectDocumentData(documentData) {} + async #collectDocumentData(documentData) { + const { document } = documentData; + await document.pdfManager + .ensureCatalog("rawPageLabels") + .then(pageLabels => (documentData.pageLabels = pageLabels)); + } /** * Post process the collected page data. @@ -306,6 +316,56 @@ class PDFEditor { pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null; } + async #collectPageLabels() { + // We can only preserve page labels when editing a single PDF file. + // This is consistent with behavior in Adobe Acrobat. + if (!this.hasSingleFile) { + return; + } + const { + documentData: { document, pageLabels }, + } = this.oldPages[0]; + if (!pageLabels) { + return; + } + const numPages = document.numPages; + const oldPageLabels = []; + const oldPageIndices = new Set( + this.oldPages.map(({ page: { pageIndex } }) => pageIndex) + ); + let currentLabel = null; + let stFirstIndex = -1; + for (let i = 0; i < numPages; i++) { + const newLabel = pageLabels.get(i); + if (newLabel) { + currentLabel = newLabel; + stFirstIndex = currentLabel.has("St") ? i : -1; + } + if (!oldPageIndices.has(i)) { + continue; + } + if (stFirstIndex !== -1) { + const st = currentLabel.get("St"); + currentLabel = currentLabel.clone(); + currentLabel.set("St", st + (i - stFirstIndex)); + stFirstIndex = -1; + } + oldPageLabels.push(currentLabel); + } + currentLabel = oldPageLabels[0]; + let currentIndex = 0; + const newPageLabels = (this.pageLabels = [[0, currentLabel]]); + for (let i = 0, ii = oldPageLabels.length; i < ii; i++) { + const label = oldPageLabels[i]; + if (label === currentLabel) { + continue; + } + currentIndex = i; + currentLabel = label; + newPageLabels.push([currentIndex, currentLabel]); + } + } + /** * Create a copy of a page. * @param {number} pageIndex @@ -423,6 +483,66 @@ class PDFEditor { } } + /** + * Create a name or number tree from the given map. + * @param {Array<[string, any]>} map + * @returns {Ref} + */ + #makeNameNumTree(map, areNames) { + const allEntries = map.sort( + areNames + ? ([keyA], [keyB]) => keyA.localeCompare(keyB) + : ([keyA], [keyB]) => keyA - keyB + ); + const maxLeaves = + MAX_IN_NAME_TREE_NODE <= 1 ? allEntries.length : MAX_IN_NAME_TREE_NODE; + const [treeRef, treeDict] = this.newDict; + const stack = [{ dict: treeDict, entries: allEntries }]; + const valueType = areNames ? "Names" : "Nums"; + + while (stack.length > 0) { + const { dict, entries } = stack.pop(); + if (entries.length <= maxLeaves) { + dict.set("Limits", [entries[0][0], entries.at(-1)[0]]); + dict.set(valueType, entries.flat()); + continue; + } + const entriesChunks = []; + const chunkSize = Math.max( + maxLeaves, + Math.ceil(entries.length / maxLeaves) + ); + for (let i = 0; i < entries.length; i += chunkSize) { + entriesChunks.push(entries.slice(i, i + chunkSize)); + } + const entriesRefs = []; + dict.set("Kids", entriesRefs); + for (const chunk of entriesChunks) { + const [entriesRef, entriesDict] = this.newDict; + entriesRefs.push(entriesRef); + entriesDict.set("Limits", [chunk[0][0], chunk.at(-1)[0]]); + stack.push({ dict: entriesDict, entries: chunk }); + } + } + return treeRef; + } + + /** + * Create the page labels tree if it exists. + */ + #makePageLabelsTree() { + const { pageLabels } = this; + if (!pageLabels || pageLabels.length === 0) { + return; + } + const { rootDict } = this; + const pageLabelsRef = this.#makeNameNumTree( + this.pageLabels, + /* areNames = */ false + ); + rootDict.set("PageLabels", pageLabelsRef); + } + /** * Create the root dictionary. * @returns {Promise} @@ -432,6 +552,7 @@ class PDFEditor { rootDict.setIfName("Type", "Catalog"); rootDict.set("Version", this.version); this.#makePageTree(); + this.#makePageLabelsTree(); } /** diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 91091a44e..9fc4acaf5 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -757,3 +757,4 @@ !doc_1_3_pages.pdf !doc_2_3_pages.pdf !doc_3_3_pages.pdf +!labelled_pages.pdf diff --git a/test/pdfs/labelled_pages.pdf b/test/pdfs/labelled_pages.pdf new file mode 100755 index 0000000000000000000000000000000000000000..68e389f40c5d467f478b2899378d375682f64ce3 GIT binary patch literal 6717 zcmeHMdvFuS8JDrKV~xc)E+vmTun~q5W9fG9C7nhRmL(gTSXh2g8;px|y0eij>EzQX zHcY02ha@}#hO}uW88a;zNFmUMn2=0TXfbI^2-JWRm=sz_NRxpQNT#KQv{2}t{1PC~ z#51J+$C{Bwce~%~_x*PF+ua!+SBU``S>23J|9b30otEYR1QOq@vs#JrKv?w2f#sqA z5K#_T6EYDMfI&ntFe8eX4-g`1084Quq7>8vnnD0E<_5c6Cx!)82T1nOX&G|ZhD-EI zg6Q$dp(wWKl!CEPI0`7jBa1@7uShaLq}@)G1cHhv6D2{PBKC_AWv+UwtB8Y6k~1?b z!%-}4c93S;;dGH4Wnz)TN!zWEb3il46&Dw|93>o07m+k%iyUSrD&m}`Vu!tfC<_Z> zdp!=EQU5nefHq_Qs)%wi+@vf86jYsgLXFc*R7sKo7#MG_#>fDMCTf;NM53-rY$R&y z5x}uw1fq&8`a(MG(z84-WP5q(+GiJq&s@Ig2Ok(`hkK16>Coiz+@@ zX_v)Dofhh}GiDT5l<2gBFk-7p=Z;n^f zVafj9BoLgE-kJV%`b4^Q^7%$1N&8!Ke5DQeCbhDq>0y^C=3DDgNy7HHqauUJORDYeR4F0~3FU zgX+ZcL8)n&Pz=J20|uP36e*J0>!A^g<1dRy{A=RyZv)}R?D37*is>eX6+p9bQ&nPA ziphQv6A&}4N>VV#0PIhz>gBNcv9+EsYQ`Y;1mcZ>b$U$mu2S;ev96#0-<)+SOUKT- z>h0KBS7+j}v!1XP-UH5$Ht$HC7RN4llrY;RS!zlGa#R6`9a?HA2I7){r1~e$3ck5o z2_-H$>z;r7@!u6re)`tog%|Uw==sc7K1n^ZddlT@GmN|M%gK6m&#M>qU+Y}2^cho9 zY2BWiMs@t;vqCXi_<8Py%uRV2&-bSMG+Ou_bJ?U>U8(u!3*DdOahrpa4lZ$Bb=nS| ztF1eGWqOZ>D_OiZ$y$sOmWvV7M{n`KBR+qN zr~qDU3WRO?ulDZOgMeVu*D`MCjyT1ofzsu&SiO9HjemKIpA+{@HPr9Bi0;bH}} zhl1g#rQN3Y;o?=a;Q#7tLJ#6ylop%bp*Dazw+A>SSp=LB8T=3;Tyz*&6C=PzQvo0$ z9s-ah5HcXnLLv)d2g^6TU9IG-ve0O$a+M6&!%sH-Qbmba2%@d6&Dcg6CAo<}9LEum zBuLVLdl;h2!iulm5ROjoS2C!_B}V;nAfg1MFi`dScqyjX^!lNaQMEk~SST`NC>%A$ zgEac35Yg_75XcCLWI}>}NG1}KgK^gcKOqLi5U!x2SPmtX!ZQy) z6iJRYm}r4D&;l|Wc#_A;SyD7ZhC(9h4-e~2kWJ8wBe8g7Nr?GH`I|s>TmJz)Rze*& z3FD}LkjCHQQHo~>VrYtpH>o4)Tm1|N_$`_lH!>=4$Ke`>Yg7n~s`EIz#^D+j0;B3Y z&aQg~mv(rQfzNAg`Zm0UNVbbfIIbFT?}f~RdoP5~)#u|g@A#SL-5X~TEr1?~sv)X` zs20+DJg;~6wH+mTXQ#;j{0&vTc1y?Zbgcsfz;|~}t6I2m$Fz+v{%ldODWkSUTiM)v z-w(8puJ753T$D>H|~^yRO@$qJ`YKnWXaT@41SkqSn$&=IxO09)IJaMhJ;sD)y;zUk( zQDNne<^|;Z)`O=`rKCNm3H{pe+hjo{oUJ9liJ