diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index 5241863a7..42769f636 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -32,6 +32,8 @@ class PageData { this.page = page; this.documentData = documentData; this.annotations = null; + // Named destinations which points to this page. + this.pointingNamedDestinations = null; documentData.pagesMap.put(page.ref, this); } @@ -40,9 +42,13 @@ class PageData { class DocumentData { constructor(document) { this.document = document; + this.destinations = null; this.pageLabels = null; this.pagesMap = new RefSetCache(); this.oldRefMapping = new RefSetCache(); + this.dedupNamedDestinations = new Map(); + this.usedNamedDestinations = new Set(); + this.postponedRefCopies = new RefSetCache(); } } @@ -64,6 +70,7 @@ class PDFEditor { this.title = title; this.author = author; this.pageLabels = null; + this.namedDestinations = new Map(); } /** @@ -114,15 +121,21 @@ class PDFEditor { if (newRef) { return newRef; } + const oldRef = obj; + obj = await xref.fetchAsync(oldRef); + if (typeof obj === "number") { + // Simple value; no need to create a new reference. + return obj; + } + newRef = this.newRef; - oldRefMapping.put(obj, newRef); - obj = await xref.fetchAsync(obj); + oldRefMapping.put(oldRef, newRef); if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) { if ( obj instanceof Dict && isName(obj.get("Type"), "Page") && - !this.currentDocument.pagesMap.has(obj) + !this.currentDocument.pagesMap.has(oldRef) ) { throw new Error( "Add a deleted page to the document is not supported." @@ -134,11 +147,20 @@ class PDFEditor { return newRef; } const promises = []; + const { + currentDocument: { postponedRefCopies }, + } = this; if (Array.isArray(obj)) { if (mustClone) { obj = obj.slice(); } for (let i = 0, ii = obj.length; i < ii; i++) { + const postponedActions = postponedRefCopies.get(obj[i]); + if (postponedActions) { + // The object is a reference that needs to be copied later. + postponedActions.push(ref => (obj[i] = ref)); + continue; + } promises.push( this.#collectDependencies(obj[i], true, xref).then( newObj => (obj[i] = newObj) @@ -159,6 +181,12 @@ class PDFEditor { } if (dict) { for (const [key, rawObj] of dict.getRawEntries()) { + const postponedActions = postponedRefCopies.get(rawObj); + if (postponedActions) { + // The object is a reference that needs to be copied later. + postponedActions.push(ref => dict.set(key, ref)); + continue; + } promises.push( this.#collectDependencies(rawObj, true, xref).then(newObj => dict.set(key, newObj) @@ -189,11 +217,13 @@ class PDFEditor { const promises = []; let newIndex = 0; this.hasSingleFile = pageInfos.length === 1; + const allDocumentData = []; for (const { document, includePages, excludePages } of pageInfos) { if (!document) { continue; } const documentData = new DocumentData(document); + allDocumentData.push(documentData); promises.push(this.#collectDocumentData(documentData)); let keptIndices, keptRanges, deletedIndices, deletedRanges; for (const page of includePages || []) { @@ -256,6 +286,7 @@ class PDFEditor { await Promise.all(promises); promises.length = 0; + this.#collectValidDestinations(allDocumentData); this.#collectPageLabels(); for (const page of this.oldPages) { @@ -263,10 +294,15 @@ class PDFEditor { } await Promise.all(promises); + this.#findDuplicateNamedDestinations(); + this.#setPostponedRefCopies(allDocumentData); + for (let i = 0, ii = this.oldPages.length; i < ii; i++) { this.newPages[i] = await this.#makePageCopy(i, null); } + this.#fixPostponedRefCopies(allDocumentData); + return this.writePDF(); } @@ -276,10 +312,17 @@ class PDFEditor { * @return {Promise} */ async #collectDocumentData(documentData) { - const { document } = documentData; - await document.pdfManager - .ensureCatalog("rawPageLabels") - .then(pageLabels => (documentData.pageLabels = pageLabels)); + const { + document: { pdfManager }, + } = documentData; + await Promise.all([ + pdfManager + .ensureCatalog("destinations") + .then(destinations => (documentData.destinations = destinations)), + pdfManager + .ensureCatalog("rawPageLabels") + .then(pageLabels => (documentData.pageLabels = pageLabels)), + ]); } /** @@ -290,6 +333,7 @@ class PDFEditor { async #postCollectPageData(pageData) { const { page: { xref, annotations }, + documentData: { pagesMap, destinations, usedNamedDestinations }, } = pageData; if (!annotations) { @@ -300,22 +344,185 @@ class PDFEditor { let newAnnotations = []; let newIndex = 0; - // TODO: remove only links to deleted pages. + // Filter out annotations that are linking to deleted pages. for (const annotationRef of annotations) { const newAnnotationIndex = newIndex++; promises.push( xref.fetchIfRefAsync(annotationRef).then(async annotationDict => { if (!isName(annotationDict.get("Subtype"), "Link")) { newAnnotations[newAnnotationIndex] = annotationRef; + return; + } + const action = annotationDict.get("A"); + const dest = + action instanceof Dict + ? action.get("D") + : annotationDict.get("Dest"); + + if ( + !dest /* not a destination */ || + (Array.isArray(dest) && + (!(dest[0] instanceof Ref) || pagesMap.has(dest[0]))) + ) { + // Keep the annotation as is: it isn't linking to a deleted page. + newAnnotations[newAnnotationIndex] = annotationRef; + } else if (typeof dest === "string") { + const destString = stringToPDFString( + dest, + /* keepEscapeSequence = */ true + ); + if (destinations.has(destString)) { + // Keep the annotation as is: the named destination is valid. + // Valid named destinations have been collected previously (see + // #collectValidDestinations). + newAnnotations[newAnnotationIndex] = annotationRef; + usedNamedDestinations.add(destString); + } } }) ); } + await Promise.all(promises); newAnnotations = newAnnotations.filter(annot => !!annot); pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null; } + /** + * Some references cannot be copied right away since they correspond to some + * pages that haven't been processed yet. Postpone the copy of those + * references. + * @param {Array} allDocumentData + */ + #setPostponedRefCopies(allDocumentData) { + for (const { postponedRefCopies, pagesMap } of allDocumentData) { + for (const oldPageRef of pagesMap.keys()) { + postponedRefCopies.put(oldPageRef, []); + } + } + } + + /** + * Fix all postponed reference copies. + * @param {Array} allDocumentData + */ + #fixPostponedRefCopies(allDocumentData) { + for (const { postponedRefCopies, oldRefMapping } of allDocumentData) { + for (const [oldRef, actions] of postponedRefCopies.items()) { + const newRef = oldRefMapping.get(oldRef); + for (const action of actions) { + action(newRef); + } + } + postponedRefCopies.clear(); + } + } + + /** + * Collect named destinations that are still valid (i.e. pointing to kept + * pages). + * @param {Array} allDocumentData + */ + #collectValidDestinations(allDocumentData) { + // TODO: Handle OpenAction as well. + for (const documentData of allDocumentData) { + if (!documentData.destinations) { + continue; + } + const { destinations, pagesMap } = documentData; + const newDestinations = (documentData.destinations = new Map()); + for (const [key, dest] of Object.entries(destinations)) { + const pageRef = dest[0]; + const pageData = pagesMap.get(pageRef); + if (!pageData) { + continue; + } + (pageData.pointingNamedDestinations ||= new Set()).add(key); + newDestinations.set(key, dest); + } + } + } + + /** + * Find and rename duplicate named destinations. + */ + #findDuplicateNamedDestinations() { + const { namedDestinations } = this; + for (let i = 0, ii = this.oldPages.length; i < ii; i++) { + const page = this.oldPages[i]; + const { + documentData: { + destinations, + dedupNamedDestinations, + usedNamedDestinations, + }, + } = page; + let { pointingNamedDestinations } = page; + + if (!pointingNamedDestinations) { + // No named destinations pointing to this page. + continue; + } + // Keep only the named destinations that are still used. + page.pointingNamedDestinations = pointingNamedDestinations = + pointingNamedDestinations.intersection(usedNamedDestinations); + + for (const pointingDest of pointingNamedDestinations) { + if (!usedNamedDestinations.has(pointingDest)) { + // If the named destination isn't used, we can keep it as is. + continue; + } + const dest = destinations.get(pointingDest).slice(); + if (!namedDestinations.has(pointingDest)) { + // If the named destination hasn't been used yet, we can keep it + // as is. + namedDestinations.set(pointingDest, dest); + continue; + } + // Create a new unique named destination. + const newName = `${pointingDest}_p${i + 1}`; + dedupNamedDestinations.set(pointingDest, newName); + namedDestinations.set(newName, dest); + } + } + } + + /** + * Fix named destinations in the annotations. + * @param {Array} annotations + * @param {Map} dedupNamedDestinations + */ + #fixNamedDestinations(annotations, dedupNamedDestinations) { + if (dedupNamedDestinations.size === 0) { + return; + } + const fixDestination = (dict, key, dest) => { + if (typeof dest === "string") { + dict.set( + key, + dedupNamedDestinations.get( + stringToPDFString(dest, /* keepEscapeSequence = */ true) + ) || dest + ); + } + }; + + for (const annotRef of annotations) { + const annotDict = this.xref[annotRef.num]; + if (!isName(annotDict.get("Subtype"), "Link")) { + continue; + } + const action = annotDict.get("A"); + if (action instanceof Dict && action.has("D")) { + const dest = action.get("D"); + fixDestination(action, "D", dest); + continue; + } + const dest = annotDict.get("Dest"); + fixDestination(annotDict, "Dest", dest); + } + } + async #collectPageLabels() { // We can only preserve page labels when editing a single PDF file. // This is consistent with behavior in Adobe Acrobat. @@ -372,14 +579,23 @@ class PDFEditor { * @returns {Promise} the page reference in the new PDF document. */ async #makePageCopy(pageIndex) { - const { page, documentData, annotations } = this.oldPages[pageIndex]; + const { page, documentData, annotations, pointingNamedDestinations } = + this.oldPages[pageIndex]; this.currentDocument = documentData; - const { oldRefMapping } = documentData; + const { dedupNamedDestinations, oldRefMapping } = documentData; const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page; const pageRef = this.newRef; const pageDict = (this.xref[pageRef.num] = page.pageDict.clone()); oldRefMapping.put(oldPageRef, pageRef); + if (pointingNamedDestinations) { + for (const pointingDest of pointingNamedDestinations) { + const name = dedupNamedDestinations.get(pointingDest) || pointingDest; + const dest = this.namedDestinations.get(name); + dest[0] = pageRef; + } + } + // No need to keep these entries as we'll set them again later. for (const key of [ "Rotate", @@ -416,10 +632,16 @@ class PDFEditor { "Resources", await this.#collectDependencies(resources, true, xref) ); - pageDict.setIfArray( - "Annots", - await this.#collectDependencies(annotations, true, xref) - ); + + if (annotations) { + const newAnnotations = await this.#collectDependencies( + annotations, + true, + xref + ); + this.#fixNamedDestinations(newAnnotations, dedupNamedDestinations); + pageDict.setIfArray("Annots", newAnnotations); + } if (this.useObjectStreams) { const newLastRef = this.newRefCount; @@ -485,7 +707,7 @@ class PDFEditor { /** * Create a name or number tree from the given map. - * @param {Array<[string, any]>} map + * @param {Array<[string|number, any]>} map * @returns {Ref} */ #makeNameNumTree(map, areNames) { @@ -543,6 +765,24 @@ class PDFEditor { rootDict.set("PageLabels", pageLabelsRef); } + #makeDestinationsTree() { + const { namedDestinations } = this; + if (namedDestinations.size === 0) { + return; + } + if (!this.namesDict) { + [this.namesRef, this.namesDict] = this.newDict; + this.rootDict.set("Names", this.namesRef); + } + this.namesDict.set( + "Dests", + this.#makeNameNumTree( + Array.from(namedDestinations.entries()), + /* areNames = */ true + ) + ); + } + /** * Create the root dictionary. * @returns {Promise} @@ -553,6 +793,7 @@ class PDFEditor { rootDict.set("Version", this.version); this.#makePageTree(); this.#makePageLabelsTree(); + this.#makeDestinationsTree(); } /** diff --git a/src/core/primitives.js b/src/core/primitives.js index 22cdd2527..854341dbd 100644 --- a/src/core/primitives.js +++ b/src/core/primitives.js @@ -439,6 +439,12 @@ class RefSetCache { yield [Ref.fromString(ref), value]; } } + + *keys() { + for (const ref of this._map.keys()) { + yield Ref.fromString(ref); + } + } } function isName(v, name) { diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 9fc4acaf5..e2bb0f1e8 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -758,3 +758,4 @@ !doc_2_3_pages.pdf !doc_3_3_pages.pdf !labelled_pages.pdf +!extract_link.pdf diff --git a/test/pdfs/extract_link.pdf b/test/pdfs/extract_link.pdf new file mode 100755 index 000000000..e9e4be432 Binary files /dev/null and b/test/pdfs/extract_link.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 892166732..34559c4b4 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5337,6 +5337,15 @@ small scripts as well as for`); }); describe("PDF page editing", function () { + const getPageRefs = async pdfDoc => { + const refs = []; + for (let i = 1; i <= pdfDoc.numPages; i++) { + const page = await pdfDoc.getPage(i); + refs.push(page.ref); + } + return refs; + }; + describe("Merge pdfs", function () { it("should merge three PDFs", async function () { const loadingTask = getDocument( @@ -5576,5 +5585,142 @@ small scripts as well as for`); await loadingTask.destroy(); }); }); + + describe("Named destinations", function () { + it("extract page and check destinations", async function () { + let loadingTask = getDocument(buildGetDocumentParams("issue6204.pdf")); + let pdfDoc = await loadingTask.promise; + let pagesRef = await getPageRefs(pdfDoc); + let destinations = await pdfDoc.getDestinations(); + expect(destinations).toEqual({ + "Page.1": [pagesRef[0], { name: "XYZ" }, 0, 375, null], + "Page.2": [pagesRef[1], { name: "XYZ" }, 0, 375, null], + }); + + let data = await pdfDoc.extractPages([ + { document: null }, + { document: null }, + ]); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + + expect(pdfDoc.numPages).toEqual(4); + + pagesRef = await getPageRefs(pdfDoc); + destinations = await pdfDoc.getDestinations(); + expect(destinations).toEqual({ + "Page.1": [pagesRef[0], { name: "XYZ" }, 0, 375, null], + "Page.2": [pagesRef[1], { name: "XYZ" }, 0, 375, null], + "Page.1_p3": [pagesRef[2], { name: "XYZ" }, 0, 375, null], + "Page.2_p4": [pagesRef[3], { name: "XYZ" }, 0, 375, null], + }); + const expectedDests = ["Page.2", "Page.1", "Page.2_p4", "Page.1_p3"]; + for (let i = 1; i <= 4; i++) { + const pdfPage = await pdfDoc.getPage(i); + const annots = await pdfPage.getAnnotations(); + expect(annots.length).toEqual(1); + expect(annots[0].dest).toEqual(expectedDests[i - 1]); + } + + data = await pdfDoc.extractPages([ + { document: null }, + { document: null }, + ]); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + + expect(pdfDoc.numPages).toEqual(8); + + pagesRef = await getPageRefs(pdfDoc); + destinations = await pdfDoc.getDestinations(); + expect(destinations).toEqual({ + "Page.1": [pagesRef[0], { name: "XYZ" }, 0, 375, null], + "Page.2": [pagesRef[1], { name: "XYZ" }, 0, 375, null], + "Page.1_p3": [pagesRef[2], { name: "XYZ" }, 0, 375, null], + "Page.2_p4": [pagesRef[3], { name: "XYZ" }, 0, 375, null], + "Page.1_p5": [pagesRef[4], { name: "XYZ" }, 0, 375, null], + "Page.2_p6": [pagesRef[5], { name: "XYZ" }, 0, 375, null], + "Page.1_p3_p7": [pagesRef[6], { name: "XYZ" }, 0, 375, null], + "Page.2_p4_p8": [pagesRef[7], { name: "XYZ" }, 0, 375, null], + }); + expectedDests.push( + "Page.2_p6", + "Page.1_p5", + "Page.2_p4_p8", + "Page.1_p3_p7" + ); + for (let i = 1; i <= 8; i++) { + const pdfPage = await pdfDoc.getPage(i); + const annots = await pdfPage.getAnnotations(); + expect(annots.length).toEqual(1); + expect(annots[0].dest).toEqual(expectedDests[i - 1]); + } + await loadingTask.destroy(); + }); + + it("extract pages and check deleted destinations", async function () { + let loadingTask = getDocument(buildGetDocumentParams("issue6204.pdf")); + let pdfDoc = await loadingTask.promise; + const data = await pdfDoc.extractPages([ + { document: null }, + { document: null, excludePages: [0] }, + ]); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + + expect(pdfDoc.numPages).toEqual(3); + + const pagesRef = await getPageRefs(pdfDoc); + const destinations = await pdfDoc.getDestinations(); + expect(destinations).toEqual({ + "Page.1": [pagesRef[0], { name: "XYZ" }, 0, 375, null], + "Page.2": [pagesRef[1], { name: "XYZ" }, 0, 375, null], + }); + const pdfPage = await pdfDoc.getPage(3); + const annots = await pdfPage.getAnnotations(); + expect(annots.length).toEqual(0); + }); + }); + + describe("Destinations with a page reference", function () { + it("extract page and check destinations", async function () { + let loadingTask = getDocument( + buildGetDocumentParams("extract_link.pdf") + ); + let pdfDoc = await loadingTask.promise; + let pagesRef = await getPageRefs(pdfDoc); + let pdfPage = await pdfDoc.getPage(1); + let annotations = await pdfPage.getAnnotations(); + expect(annotations.length).toEqual(1); + expect(annotations[0].dest[0]).toEqual(pagesRef[1]); + + const data = await pdfDoc.extractPages([ + { document: null }, + { document: null }, + ]); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + + expect(pdfDoc.numPages).toEqual(4); + + pagesRef = await getPageRefs(pdfDoc); + for (let i = 1; i <= 4; i += 2) { + pdfPage = await pdfDoc.getPage(i); + annotations = await pdfPage.getAnnotations(); + expect(annotations.length).toEqual(1); + expect(annotations[0].dest[0]).toEqual(pagesRef[i]); + } + + await loadingTask.destroy(); + }); + }); }); }); diff --git a/test/unit/primitives_spec.js b/test/unit/primitives_spec.js index b71df0b83..ed9993121 100644 --- a/test/unit/primitives_spec.js +++ b/test/unit/primitives_spec.js @@ -562,6 +562,12 @@ describe("primitives", function () { [ref2, obj2], ]); }); + + it("should support iteration over keys", function () { + cache.put(ref1, obj1); + cache.put(ref2, obj2); + expect([...cache.keys()]).toEqual([ref1, ref2]); + }); }); describe("isName", function () {