diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js index 9e4f81c62..2f9d003e3 100644 --- a/src/core/editor/pdf_editor.js +++ b/src/core/editor/pdf_editor.js @@ -17,9 +17,10 @@ /** @typedef {import("../document.js").Page} Page */ /** @typedef {import("../xref.js").XRef} XRef */ -import { Dict, isName, Ref, RefSetCache } from "../primitives.js"; +import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js"; import { getModificationDate, stringToPDFString } from "../../shared/util.js"; import { incrementalUpdate, writeValue } from "../writer.js"; +import { NameTree, NumberTree } from "../name_number_tree.js"; import { BaseStream } from "../base_stream.js"; import { StringStream } from "../stream.js"; import { stringToAsciiOrUTF16BE } from "../core_utils.js"; @@ -49,6 +50,16 @@ class DocumentData { this.dedupNamedDestinations = new Map(); this.usedNamedDestinations = new Set(); this.postponedRefCopies = new RefSetCache(); + this.usedStructParents = new Set(); + this.oldStructParentMapping = new Map(); + this.structTreeRoot = null; + this.parentTree = null; + this.idTree = null; + this.roleMap = null; + this.classMap = null; + this.namespaces = null; + this.structTreeAF = null; + this.structTreePronunciationLexicon = []; } } @@ -82,6 +93,14 @@ class PDFEditor { this.author = author; this.pageLabels = null; this.namedDestinations = new Map(); + this.parentTree = new Map(); + this.structTreeKids = []; + this.idTree = new Map(); + this.classMap = new Dict(); + this.roleMap = new Dict(); + this.namespaces = new Map(); + this.structTreeAF = []; + this.structTreePronunciationLexicon = []; } /** @@ -115,6 +134,12 @@ class PDFEditor { return ref; } + /** Return a clone of `dict` whose `xref` property is set to this editor's `xrefWrapper` (the original dict is left untouched). */ cloneDict(dict) { + const newDict = dict.clone(); + newDict.xref = this.xrefWrapper; + return newDict; + } + /** * Collect the dependencies of an object and create new references for each * dependency.
@@ -212,6 +237,232 @@ class PDFEditor { return obj; } + async #cloneStructTreeNode( + parentStructRef, + node, + xref, + removedStructElements, + dedupIDs, + dedupClasses, + dedupRoles, + visited = new RefSet() + ) { + const { + currentDocument: { pagesMap, oldRefMapping }, + } = this; + const pg = node.getRaw("Pg"); + if (pg instanceof Ref && !pagesMap.has(pg)) { + return null; + } + let kids; + const k = (kids = node.getRaw("K")); + if (k instanceof Ref) { + // We're only interested by ref referencing nodes and not an array. + if (visited.has(k)) { + return null; + } + kids = await xref.fetchAsync(k); + if (!Array.isArray(kids)) { + kids = [k]; + } + } + kids = Array.isArray(kids) ? kids : [kids]; + const newKids = []; + const structElemIndices = []; + for (let kid of kids) { + const kidRef = kid instanceof Ref ? kid : null; + if (kidRef) { + if (visited.has(kidRef)) { + continue; + } + visited.put(kidRef); + kid = await xref.fetchAsync(kidRef); + } + if (typeof kid === "number") { + newKids.push(kid); + continue; + } + if (!(kid instanceof Dict)) { + continue; + } + const pgRef = kid.getRaw("Pg"); + if (pgRef instanceof Ref && !pagesMap.has(pgRef)) { + continue; + } + const type = kid.get("Type"); + if (!type || isName(type, "StructElem")) { + let setAsSpan = false; + if (kidRef && removedStructElements.has(kidRef)) { + if (!isName(kid.get("S"), "Link")) { + continue; + } + // A link annotation has been removed but we still need to keep the + // node in order to preserve the structure tree. Mark it as a Span + // so that it doesn't affect the semantics. 
+ setAsSpan = true; + } + const newKidRef = await this.#cloneStructTreeNode( + kidRef, + kid, + xref, + removedStructElements, + dedupIDs, + dedupClasses, + dedupRoles, + visited + ); + if (newKidRef) { + structElemIndices.push(newKids.length); + newKids.push(newKidRef); + if (kidRef) { + oldRefMapping.put(kidRef, newKidRef); + } + if (setAsSpan) { + this.xref[newKidRef.num].setIfName("S", "Span"); + } + } + continue; + } + if (isName(type, "OBJR")) { + if (!kidRef) { + continue; + } + const newKidRef = oldRefMapping.get(kidRef); + if (!newKidRef) { + continue; + } + const newKid = this.xref[newKidRef.num]; + // Fix the missing StructParent entry in the referenced object. + const objRef = newKid.getRaw("Obj"); + if (objRef instanceof Ref) { + const obj = this.xref[objRef.num]; + if ( + obj instanceof Dict && + !obj.has("StructParent") && + parentStructRef + ) { + const structParent = this.parentTree.size; + this.parentTree.set(structParent, [oldRefMapping, parentStructRef]); + obj.set("StructParent", structParent); + } + } + newKids.push(newKidRef); + continue; + } + if (isName(type, "MCR")) { + const newKid = await this.#collectDependencies( + kidRef || kid, + true, + xref + ); + newKids.push(newKid); + continue; + } + if (kidRef) { + const newKidRef = await this.#collectDependencies(kidRef, true, xref); + newKids.push(newKidRef); + } + } + if (kids.length !== 0 && newKids.length === 0) { + return null; + } + + const newNodeRef = this.newRef; + const newNode = (this.xref[newNodeRef.num] = this.cloneDict(node)); + // Don't collect for ID or C since they will be fixed later. + newNode.delete("ID"); + newNode.delete("C"); + newNode.delete("K"); + newNode.delete("P"); + newNode.delete("S"); + await this.#collectDependencies(newNode, false, xref); + + // Fix the class names. 
+ const classNames = node.get("C"); + if (classNames instanceof Name) { + const newClassName = dedupClasses.get(classNames.name); + if (newClassName) { + newNode.set("C", Name.get(newClassName)); + } else { + newNode.set("C", classNames); + } + } else if (Array.isArray(classNames)) { + const newClassNames = []; + for (const className of classNames) { + if (className instanceof Name) { + const newClassName = dedupClasses.get(className.name); + if (newClassName) { + newClassNames.push(Name.get(newClassName)); + } else { + newClassNames.push(className); + } + } + } + newNode.set("C", newClassNames); + } + + // Fix the role name. + const roleName = node.get("S"); + if (roleName instanceof Name) { + const newRoleName = dedupRoles.get(roleName.name); + if (newRoleName) { + newNode.set("S", Name.get(newRoleName)); + } else { + newNode.set("S", roleName); + } + } + + // Fix the ID. + const id = node.get("ID"); + if (typeof id === "string") { + const stringId = stringToPDFString(id, /* keepEscapeSequence = */ false); + const newId = dedupIDs.get(stringId); + if (newId) { + newNode.set("ID", stringToAsciiOrUTF16BE(newId)); + } else { + newNode.set("ID", id); + } + } + + // Table headers may contain IDs that need to be deduplicated. 
+ let attributes = newNode.get("A"); + if (attributes) { + if (!Array.isArray(attributes)) { + attributes = [attributes]; + } + for (let attr of attributes) { + attr = this.xrefWrapper.fetch(attr); + if (isName(attr.get("O"), "Table") && attr.has("Headers")) { + const headers = this.xrefWrapper.fetch(attr.getRaw("Headers")); + if (Array.isArray(headers)) { + for (let i = 0, ii = headers.length; i < ii; i++) { + const newId = dedupIDs.get( + stringToPDFString(headers[i], /* keepEscapeSequence = */ false) + ); + if (newId) { + headers[i] = newId; + } + } + } + } + } + } + + for (const index of structElemIndices) { + const structElemRef = newKids[index]; + const structElem = this.xref[structElemRef.num]; + structElem.set("P", newNodeRef); + } + + if (newKids.length === 1) { + newNode.set("K", newKids[0]); + } else if (newKids.length > 1) { + newNode.set("K", newKids); + } + + return newNodeRef; + } + /** * @typedef {Object} PageInfo * @property {PDFDocument} document @@ -315,6 +566,7 @@ class PDFEditor { } this.#fixPostponedRefCopies(allDocumentData); + await this.#mergeStructTrees(allDocumentData); return this.writePDF(); } @@ -326,7 +578,7 @@ class PDFEditor { */ async #collectDocumentData(documentData) { const { - document: { pdfManager }, + document: { pdfManager, xref }, } = documentData; await Promise.all([ pdfManager @@ -335,7 +587,34 @@ class PDFEditor { pdfManager .ensureCatalog("rawPageLabels") .then(pageLabels => (documentData.pageLabels = pageLabels)), + pdfManager + .ensureCatalog("structTreeRoot") + .then(structTreeRoot => (documentData.structTreeRoot = structTreeRoot)), ]); + const structTreeRoot = documentData.structTreeRoot; + if (structTreeRoot) { + const rootDict = structTreeRoot.dict; + const parentTree = rootDict.get("ParentTree"); + if (parentTree) { + const numberTree = new NumberTree(parentTree, xref); + documentData.parentTree = numberTree.getAll(/* isRaw = */ true); + } + const idTree = rootDict.get("IDTree"); + if (idTree) { + const nameTree 
= new NameTree(idTree, xref); + documentData.idTree = nameTree.getAll(/* isRaw = */ true); + } + documentData.roleMap = rootDict.get("RoleMap") || null; + documentData.classMap = rootDict.get("ClassMap") || null; + let namespaces = rootDict.get("Namespaces") || null; + if (namespaces && !Array.isArray(namespaces)) { + namespaces = [namespaces]; + } + documentData.namespaces = namespaces; + documentData.structTreeAF = rootDict.get("AF") || null; + documentData.structTreePronunciationLexicon = + rootDict.get("PronunciationLexicon") || null; + } } /** @@ -371,7 +650,6 @@ class PDFEditor { action instanceof Dict ? action.get("D") : annotationDict.get("Dest"); - if ( !dest /* not a destination */ || (Array.isArray(dest) && @@ -431,6 +709,293 @@ class PDFEditor { } } + #visitObject(obj, callback, visited = new RefSet()) { + if (obj instanceof Ref) { + if (!visited.has(obj)) { + visited.put(obj); + this.#visitObject(this.xref[obj.num], callback, visited); + } + return; + } + if (Array.isArray(obj)) { + for (const item of obj) { + this.#visitObject(item, callback, visited); + } + return; + } + let dict; + if (obj instanceof BaseStream) { + ({ dict } = obj); + } else if (obj instanceof Dict) { + dict = obj; + } + if (dict) { + callback(dict); + for (const value of dict.getRawValues()) { + this.#visitObject(value, callback, visited); + } + } + } + + async #mergeStructTrees(allDocumentData) { + let newStructParentId = 0; + const { parentTree: newParentTree } = this; + for (let i = 0, ii = this.newPages.length; i < ii; i++) { + const { + documentData: { + parentTree, + oldRefMapping, + oldStructParentMapping, + usedStructParents, + document: { xref }, + }, + } = this.oldPages[i]; + if (!parentTree) { + continue; + } + const pageRef = this.newPages[i]; + const pageDict = this.xref[pageRef.num]; + + // Visit the new page in order to collect used StructParent entries. + this.#visitObject(pageDict, dict => { + const structParent = + dict.get("StructParent") ?? 
dict.get("StructParents"); + if (typeof structParent !== "number") { + return; + } + usedStructParents.add(structParent); + let parent = parentTree.get(structParent); + const parentRef = parent instanceof Ref ? parent : null; + if (parentRef) { + const array = xref.fetch(parentRef); + if (Array.isArray(array)) { + parent = array; + } + } + if (Array.isArray(parent) && parent.every(ref => ref === null)) { + parent = null; + } + if (!parent) { + if (dict.has("StructParent")) { + dict.delete("StructParent"); + } else { + dict.delete("StructParents"); + } + return; + } + let newStructParent = oldStructParentMapping.get(structParent); + if (newStructParent === undefined) { + newStructParent = newStructParentId++; + oldStructParentMapping.set(structParent, newStructParent); + newParentTree.set(newStructParent, [oldRefMapping, parent]); + } + if (dict.has("StructParent")) { + dict.set("StructParent", newStructParent); + } else { + dict.set("StructParents", newStructParent); + } + }); + } + + const { + structTreeKids, + idTree: newIdTree, + classMap: newClassMap, + roleMap: newRoleMap, + namespaces: newNamespaces, + structTreeAF: newStructTreeAF, + structTreePronunciationLexicon: newStructTreePronunciationLexicon, + } = this; + // Clone the struct tree nodes for each document. + for (const documentData of allDocumentData) { + const { + document: { xref }, + oldRefMapping, + parentTree, + usedStructParents, + structTreeRoot, + idTree, + classMap, + roleMap, + namespaces, + structTreeAF, + structTreePronunciationLexicon, + } = documentData; + + if (!structTreeRoot) { + continue; + } + + this.currentDocument = documentData; + // Get all the removed StructElem + const removedStructElements = new RefSet(); + for (const [key, value] of parentTree || []) { + if (!usedStructParents.has(key) && value instanceof Ref) { + removedStructElements.put(value); + } + } + + // Deduplicate IDs in the ID tree. 
+ // We keep the old node references since they will be cloned later when + // cloning the struct tree. + const dedupIDs = new Map(); + for (const [id, nodeRef] of idTree || []) { + let _id = id; + if (newIdTree.has(id)) { + for (let i = 1; ; i++) { + const newId = `${id}_${i}`; + if (!newIdTree.has(newId)) { + dedupIDs.set(id, newId); + _id = newId; + break; + } + } + } + newIdTree.set(_id, nodeRef); + } + + const dedupClasses = new Map(); + if (classMap?.size > 0) { + // Deduplicate ClassMap entries. + for (let [className, classDict] of classMap) { + classDict = await this.#collectDependencies(classDict, true, xref); + if (newClassMap.has(className)) { + for (let i = 1; ; i++) { + const newClassName = `${className}_${i}`; + if (!newClassMap.has(newClassName)) { + dedupClasses.set(className, newClassName); + className = newClassName; + break; + } + } + } + newClassMap.set(className, classDict); + } + } + + const dedupRoles = new Map(); + if (roleMap?.size > 0) { + // Deduplicate RoleMap entries. 
+ for (const [roleName, mappedName] of roleMap) { + const newMappedName = newRoleMap.get(roleName); + if (!newMappedName) { + newRoleMap.set(roleName, mappedName); + continue; + } + if (newMappedName === mappedName) { + continue; + } + for (let i = 1; ; i++) { + const newRoleName = `${roleName}_${i}`; + if (!newRoleMap.has(newRoleName)) { + dedupRoles.set(roleName, newRoleName); + newRoleMap.set(newRoleName, mappedName); + break; + } + } + } + } + + if (namespaces?.length > 0) { + for (const namespaceRef of namespaces) { + const namespace = await xref.fetchIfRefAsync(namespaceRef); + let ns = namespace.get("NS"); + if (!ns || newNamespaces.has(ns)) { + continue; + } + ns = stringToPDFString(ns, /* keepEscapeSequence = */ false); + const newNamespace = await this.#collectDependencies( + namespace, + true, + xref + ); + newNamespaces.set(ns, newNamespace); + } + } + + if (structTreeAF) { + for (const afRef of structTreeAF) { + newStructTreeAF.push( + await this.#collectDependencies(afRef, true, xref) + ); + } + } + + if (structTreePronunciationLexicon) { + for (const lexiconRef of structTreePronunciationLexicon) { + newStructTreePronunciationLexicon.push( + await this.#collectDependencies(lexiconRef, true, xref) + ); + } + } + + // Get the kids. + let kids = structTreeRoot.dict.get("K"); + if (!kids) { + continue; + } + kids = Array.isArray(kids) ? kids : [kids]; + for (let kid of kids) { + const kidRef = kid instanceof Ref ? kid : null; + if (kidRef && removedStructElements.has(kidRef)) { + continue; + } + kid = await xref.fetchIfRefAsync(kid); + const newKidRef = await this.#cloneStructTreeNode( + kidRef, + kid, + xref, + removedStructElements, + dedupIDs, + dedupClasses, + dedupRoles + ); + if (newKidRef) { + structTreeKids.push(newKidRef); + } + } + + // Fix the ID tree. 
+ for (const [id, nodeRef] of idTree || []) { + const newNodeRef = oldRefMapping.get(nodeRef); + const newId = dedupIDs.get(id) || id; + if (newNodeRef) { + newIdTree.set(newId, newNodeRef); + } else { + newIdTree.delete(newId); + } + } + } + + for (const [key, [oldRefMapping, parent]] of newParentTree) { + if (!parent) { + newParentTree.delete(key); + continue; + } + // Some nodes haven't been visited while cloning the struct trees so their + // ref don't belong to the oldRefMapping. Remove those nodes. + if (!Array.isArray(parent)) { + const newParent = oldRefMapping.get(parent); + if (newParent === undefined) { + newParentTree.delete(key); + } else { + newParentTree.set(key, newParent); + } + continue; + } + const newParents = parent.map( + ref => (ref instanceof Ref && oldRefMapping.get(ref)) || null + ); + if (newParents.length === 0 || newParents.every(ref => ref === null)) { + newParentTree.delete(key); + continue; + } + newParentTree.set(key, newParents); + } + + this.currentDocument = null; + } + /** * Collect named destinations that are still valid (i.e. pointing to kept * pages). 
@@ -566,7 +1131,7 @@ class PDFEditor { } if (stFirstIndex !== -1) { const st = currentLabel.get("St"); - currentLabel = currentLabel.clone(); + currentLabel = this.cloneDict(currentLabel); currentLabel.set("St", st + (i - stFirstIndex)); stFirstIndex = -1; } @@ -598,7 +1163,7 @@ class PDFEditor { const { dedupNamedDestinations, oldRefMapping } = documentData; const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page; const pageRef = this.newRef; - const pageDict = (this.xref[pageRef.num] = page.pageDict.clone()); + const pageDict = (this.xref[pageRef.num] = this.cloneDict(page.pageDict)); oldRefMapping.put(oldPageRef, pageRef); if (pointingNamedDestinations) { @@ -796,6 +1361,71 @@ class PDFEditor { ); } + #makeStructTree() { + const { structTreeKids } = this; + if (!structTreeKids || structTreeKids.length === 0) { + return; + } + const { rootDict } = this; + const structTreeRef = this.newRef; + const structTree = (this.xref[structTreeRef.num] = new Dict()); + structTree.setIfName("Type", "StructTreeRoot"); + structTree.setIfArray("K", structTreeKids); + for (const kidRef of structTreeKids) { + const kid = this.xref[kidRef.num]; + const type = kid.get("Type"); + if (!type || isName(type, "StructElem")) { + kid.set("P", structTreeRef); + } + } + if (this.parentTree.size > 0) { + const parentTreeRef = this.#makeNameNumTree( + Array.from(this.parentTree.entries()), + /* areNames = */ false + ); + const parentTree = this.xref[parentTreeRef.num]; + parentTree.setIfName("Type", "ParentTree"); + structTree.set("ParentTree", parentTreeRef); + structTree.set("ParentTreeNextKey", this.parentTree.size); + } + if (this.idTree.size > 0) { + const idTreeRef = this.#makeNameNumTree( + Array.from(this.idTree.entries()), + /* areNames = */ true + ); + const idTree = this.xref[idTreeRef.num]; + idTree.setIfName("Type", "IDTree"); + structTree.set("IDTree", idTreeRef); + } + if (this.classMap.size > 0) { + const classMapRef = this.newRef; + this.xref[classMapRef.num] = 
this.classMap; + structTree.set("ClassMap", classMapRef); + } + if (this.roleMap.size > 0) { + const roleMapRef = this.newRef; + this.xref[roleMapRef.num] = this.roleMap; + structTree.set("RoleMap", roleMapRef); + } + if (this.namespaces.size > 0) { + const namespacesRef = this.newRef; + this.xref[namespacesRef.num] = Array.from(this.namespaces.values()); + structTree.set("Namespaces", namespacesRef); + } + if (this.structTreeAF.length > 0) { + const structTreeAFRef = this.newRef; + this.xref[structTreeAFRef.num] = this.structTreeAF; + structTree.set("AF", structTreeAFRef); + } + if (this.structTreePronunciationLexicon.length > 0) { + const structTreePronunciationLexiconRef = this.newRef; + this.xref[structTreePronunciationLexiconRef.num] = + this.structTreePronunciationLexicon; + structTree.set("PronunciationLexicon", structTreePronunciationLexiconRef); + } + rootDict.set("StructTreeRoot", structTreeRef); + } + /** * Create the root dictionary. * @returns {Promise} @@ -807,6 +1437,7 @@ class PDFEditor { this.#makePageTree(); this.#makePageLabelsTree(); this.#makeDestinationsTree(); + this.#makeStructTree(); } /** diff --git a/src/core/name_number_tree.js b/src/core/name_number_tree.js index 461711d1f..c5b63dc7b 100644 --- a/src/core/name_number_tree.js +++ b/src/core/name_number_tree.js @@ -34,7 +34,7 @@ class NameOrNumberTree { this._type = type; } - getAll() { + getAll(isRaw = false) { const map = new Map(); if (!this.root) { return map; @@ -68,7 +68,10 @@ class NameOrNumberTree { continue; } for (let i = 0, ii = entries.length; i < ii; i += 2) { - map.set(xref.fetchIfRef(entries[i]), xref.fetchIfRef(entries[i + 1])); + map.set( + xref.fetchIfRef(entries[i]), + isRaw ? 
describe("Struct trees", function () {
  // Builders for the expected structure trees: every tree checked here is
  // Root > Document > Sect > paragraphs.
  const contentNode = id => ({ type: "content", id });
  const paragraphNode = (...children) => ({
    role: "P",
    children,
    lang: "EN-US",
  });
  const documentTree = (...paragraphs) => ({
    children: [
      {
        role: "Document",
        children: [{ role: "Sect", children: paragraphs }],
      },
    ],
    role: "Root",
  });

  it("extract pages and merge struct trees", async function () {
    // The marked-content ids are prefixed with the reference of the page.
    const expectedTree = prefix =>
      documentTree(
        paragraphNode(contentNode(`${prefix}_mc0`)),
        paragraphNode(contentNode(`${prefix}_mc1`))
      );
    const expectedText = prefix => [
      `${prefix}_mc0`,
      "The first paragraph.",
      null,
      `${prefix}_mc1`,
      "",
      "The second paragraph.",
      null,
    ];
    // Keep the id of the marked-content markers and the text of the strings.
    const idOrText = item => {
      if (item.type === "beginMarkedContentProps") {
        return item.id;
      }
      return item.str !== undefined ? item.str : null;
    };
    const checkPage = async (pdfPage, prefix) => {
      const structTree = await pdfPage.getStructTree();
      expect(structTree).toEqual(expectedTree(prefix));

      const { items } = await pdfPage.getTextContent({
        includeMarkedContent: true,
        disableNormalization: true,
      });
      expect(items.map(idOrText)).toEqual(expectedText(prefix));
    };

    let loadingTask = getDocument(
      buildGetDocumentParams("two_paragraphs.pdf")
    );
    let pdfDoc = await loadingTask.promise;
    await checkPage(await pdfDoc.getPage(1), "p19R");

    // Extract the same document twice in order to have two struct trees to
    // merge.
    const data = await pdfDoc.extractPages([
      { document: null },
      { document: null },
    ]);
    await loadingTask.destroy();

    loadingTask = getDocument(data);
    pdfDoc = await loadingTask.promise;

    expect(pdfDoc.numPages).toEqual(2);
    await checkPage(await pdfDoc.getPage(1), "p4R");
    await checkPage(await pdfDoc.getPage(2), "p19R");

    await loadingTask.destroy();
  });

  it("extract pages with a removed link", async function () {
    let loadingTask = getDocument(
      buildGetDocumentParams("paragraph_and_link.pdf")
    );
    let pdfDoc = await loadingTask.promise;

    const data = await pdfDoc.extractPages([
      { document: null, excludePages: [1] },
      { document: null },
    ]);
    await loadingTask.destroy();

    loadingTask = getDocument(data);
    pdfDoc = await loadingTask.promise;

    expect(pdfDoc.numPages).toEqual(3);

    const firstPage = await pdfDoc.getPage(1);
    const firstTree = await firstPage.getStructTree();
    expect(firstTree).toEqual(
      documentTree(
        paragraphNode(contentNode("p4R_mc0")),
        paragraphNode(contentNode("p4R_mc3")),
        paragraphNode(contentNode("p4R_mc6"))
      )
    );

    const secondPage = await pdfDoc.getPage(2);
    const secondTree = await secondPage.getStructTree();
    expect(secondTree).toEqual(
      documentTree(
        paragraphNode(contentNode("p23R_mc0")),
        paragraphNode(
          {
            role: "Reference",
            children: [contentNode("p23R_mc2")],
            lang: "EN-US",
          },
          contentNode("p23R_mc3")
        ),
        paragraphNode(contentNode("p23R_mc6"))
      )
    );

    await loadingTask.destroy();
  });
});