Merge the structure trees coming from different pdfs (bug 1997379)

This commit is contained in:
Calixte Denizet 2025-11-10 21:09:35 +01:00
parent 603e3a8f3e
commit e13a618df3
6 changed files with 889 additions and 7 deletions

View File

@ -17,9 +17,10 @@
/** @typedef {import("../document.js").Page} Page */
/** @typedef {import("../xref.js").XRef} XRef */
import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
import { getModificationDate, stringToPDFString } from "../../shared/util.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { NameTree, NumberTree } from "../name_number_tree.js";
import { BaseStream } from "../base_stream.js";
import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";
@ -49,6 +50,16 @@ class DocumentData {
this.dedupNamedDestinations = new Map();
this.usedNamedDestinations = new Set();
this.postponedRefCopies = new RefSetCache();
this.usedStructParents = new Set();
this.oldStructParentMapping = new Map();
this.structTreeRoot = null;
this.parentTree = null;
this.idTree = null;
this.roleMap = null;
this.classMap = null;
this.namespaces = null;
this.structTreeAF = null;
this.structTreePronunciationLexicon = [];
}
}
@ -82,6 +93,14 @@ class PDFEditor {
this.author = author;
this.pageLabels = null;
this.namedDestinations = new Map();
this.parentTree = new Map();
this.structTreeKids = [];
this.idTree = new Map();
this.classMap = new Dict();
this.roleMap = new Dict();
this.namespaces = new Map();
this.structTreeAF = [];
this.structTreePronunciationLexicon = [];
}
/**
@ -115,6 +134,12 @@ class PDFEditor {
return ref;
}
cloneDict(dict) {
const newDict = dict.clone();
newDict.xref = this.xrefWrapper;
return newDict;
}
/**
* Collect the dependencies of an object and create new references for each
* dependency.
@ -212,6 +237,232 @@ class PDFEditor {
return obj;
}
async #cloneStructTreeNode(
parentStructRef,
node,
xref,
removedStructElements,
dedupIDs,
dedupClasses,
dedupRoles,
visited = new RefSet()
) {
const {
currentDocument: { pagesMap, oldRefMapping },
} = this;
const pg = node.getRaw("Pg");
if (pg instanceof Ref && !pagesMap.has(pg)) {
return null;
}
let kids;
const k = (kids = node.getRaw("K"));
if (k instanceof Ref) {
// We're only interested by ref referencing nodes and not an array.
if (visited.has(k)) {
return null;
}
kids = await xref.fetchAsync(k);
if (!Array.isArray(kids)) {
kids = [k];
}
}
kids = Array.isArray(kids) ? kids : [kids];
const newKids = [];
const structElemIndices = [];
for (let kid of kids) {
const kidRef = kid instanceof Ref ? kid : null;
if (kidRef) {
if (visited.has(kidRef)) {
continue;
}
visited.put(kidRef);
kid = await xref.fetchAsync(kidRef);
}
if (typeof kid === "number") {
newKids.push(kid);
continue;
}
if (!(kid instanceof Dict)) {
continue;
}
const pgRef = kid.getRaw("Pg");
if (pgRef instanceof Ref && !pagesMap.has(pgRef)) {
continue;
}
const type = kid.get("Type");
if (!type || isName(type, "StructElem")) {
let setAsSpan = false;
if (kidRef && removedStructElements.has(kidRef)) {
if (!isName(kid.get("S"), "Link")) {
continue;
}
// A link annotation has been removed but we still need to keep the
// node in order to preserve the structure tree. Mark it as a Span
// so that it doesn't affect the semantics.
setAsSpan = true;
}
const newKidRef = await this.#cloneStructTreeNode(
kidRef,
kid,
xref,
removedStructElements,
dedupIDs,
dedupClasses,
dedupRoles,
visited
);
if (newKidRef) {
structElemIndices.push(newKids.length);
newKids.push(newKidRef);
if (kidRef) {
oldRefMapping.put(kidRef, newKidRef);
}
if (setAsSpan) {
this.xref[newKidRef.num].setIfName("S", "Span");
}
}
continue;
}
if (isName(type, "OBJR")) {
if (!kidRef) {
continue;
}
const newKidRef = oldRefMapping.get(kidRef);
if (!newKidRef) {
continue;
}
const newKid = this.xref[newKidRef.num];
// Fix the missing StructParent entry in the referenced object.
const objRef = newKid.getRaw("Obj");
if (objRef instanceof Ref) {
const obj = this.xref[objRef.num];
if (
obj instanceof Dict &&
!obj.has("StructParent") &&
parentStructRef
) {
const structParent = this.parentTree.size;
this.parentTree.set(structParent, [oldRefMapping, parentStructRef]);
obj.set("StructParent", structParent);
}
}
newKids.push(newKidRef);
continue;
}
if (isName(type, "MCR")) {
const newKid = await this.#collectDependencies(
kidRef || kid,
true,
xref
);
newKids.push(newKid);
continue;
}
if (kidRef) {
const newKidRef = await this.#collectDependencies(kidRef, true, xref);
newKids.push(newKidRef);
}
}
if (kids.length !== 0 && newKids.length === 0) {
return null;
}
const newNodeRef = this.newRef;
const newNode = (this.xref[newNodeRef.num] = this.cloneDict(node));
// Don't collect for ID or C since they will be fixed later.
newNode.delete("ID");
newNode.delete("C");
newNode.delete("K");
newNode.delete("P");
newNode.delete("S");
await this.#collectDependencies(newNode, false, xref);
// Fix the class names.
const classNames = node.get("C");
if (classNames instanceof Name) {
const newClassName = dedupClasses.get(classNames.name);
if (newClassName) {
newNode.set("C", Name.get(newClassName));
} else {
newNode.set("C", classNames);
}
} else if (Array.isArray(classNames)) {
const newClassNames = [];
for (const className of classNames) {
if (className instanceof Name) {
const newClassName = dedupClasses.get(className.name);
if (newClassName) {
newClassNames.push(Name.get(newClassName));
} else {
newClassNames.push(className);
}
}
}
newNode.set("C", newClassNames);
}
// Fix the role name.
const roleName = node.get("S");
if (roleName instanceof Name) {
const newRoleName = dedupRoles.get(roleName.name);
if (newRoleName) {
newNode.set("S", Name.get(newRoleName));
} else {
newNode.set("S", roleName);
}
}
// Fix the ID.
const id = node.get("ID");
if (typeof id === "string") {
const stringId = stringToPDFString(id, /* keepEscapeSequence = */ false);
const newId = dedupIDs.get(stringId);
if (newId) {
newNode.set("ID", stringToAsciiOrUTF16BE(newId));
} else {
newNode.set("ID", id);
}
}
// Table headers may contain IDs that need to be deduplicated.
let attributes = newNode.get("A");
if (attributes) {
if (!Array.isArray(attributes)) {
attributes = [attributes];
}
for (let attr of attributes) {
attr = this.xrefWrapper.fetch(attr);
if (isName(attr.get("O"), "Table") && attr.has("Headers")) {
const headers = this.xrefWrapper.fetch(attr.getRaw("Headers"));
if (Array.isArray(headers)) {
for (let i = 0, ii = headers.length; i < ii; i++) {
const newId = dedupIDs.get(
stringToPDFString(headers[i], /* keepEscapeSequence = */ false)
);
if (newId) {
headers[i] = newId;
}
}
}
}
}
}
for (const index of structElemIndices) {
const structElemRef = newKids[index];
const structElem = this.xref[structElemRef.num];
structElem.set("P", newNodeRef);
}
if (newKids.length === 1) {
newNode.set("K", newKids[0]);
} else if (newKids.length > 1) {
newNode.set("K", newKids);
}
return newNodeRef;
}
/**
* @typedef {Object} PageInfo
* @property {PDFDocument} document
@ -315,6 +566,7 @@ class PDFEditor {
}
this.#fixPostponedRefCopies(allDocumentData);
await this.#mergeStructTrees(allDocumentData);
return this.writePDF();
}
@ -326,7 +578,7 @@ class PDFEditor {
*/
async #collectDocumentData(documentData) {
const {
document: { pdfManager },
document: { pdfManager, xref },
} = documentData;
await Promise.all([
pdfManager
@ -335,7 +587,34 @@ class PDFEditor {
pdfManager
.ensureCatalog("rawPageLabels")
.then(pageLabels => (documentData.pageLabels = pageLabels)),
pdfManager
.ensureCatalog("structTreeRoot")
.then(structTreeRoot => (documentData.structTreeRoot = structTreeRoot)),
]);
const structTreeRoot = documentData.structTreeRoot;
if (structTreeRoot) {
const rootDict = structTreeRoot.dict;
const parentTree = rootDict.get("ParentTree");
if (parentTree) {
const numberTree = new NumberTree(parentTree, xref);
documentData.parentTree = numberTree.getAll(/* isRaw = */ true);
}
const idTree = rootDict.get("IDTree");
if (idTree) {
const nameTree = new NameTree(idTree, xref);
documentData.idTree = nameTree.getAll(/* isRaw = */ true);
}
documentData.roleMap = rootDict.get("RoleMap") || null;
documentData.classMap = rootDict.get("ClassMap") || null;
let namespaces = rootDict.get("Namespaces") || null;
if (namespaces && !Array.isArray(namespaces)) {
namespaces = [namespaces];
}
documentData.namespaces = namespaces;
documentData.structTreeAF = rootDict.get("AF") || null;
documentData.structTreePronunciationLexicon =
rootDict.get("PronunciationLexicon") || null;
}
}
/**
@ -371,7 +650,6 @@ class PDFEditor {
action instanceof Dict
? action.get("D")
: annotationDict.get("Dest");
if (
!dest /* not a destination */ ||
(Array.isArray(dest) &&
@ -431,6 +709,293 @@ class PDFEditor {
}
}
#visitObject(obj, callback, visited = new RefSet()) {
if (obj instanceof Ref) {
if (!visited.has(obj)) {
visited.put(obj);
this.#visitObject(this.xref[obj.num], callback, visited);
}
return;
}
if (Array.isArray(obj)) {
for (const item of obj) {
this.#visitObject(item, callback, visited);
}
return;
}
let dict;
if (obj instanceof BaseStream) {
({ dict } = obj);
} else if (obj instanceof Dict) {
dict = obj;
}
if (dict) {
callback(dict);
for (const value of dict.getRawValues()) {
this.#visitObject(value, callback, visited);
}
}
}
async #mergeStructTrees(allDocumentData) {
let newStructParentId = 0;
const { parentTree: newParentTree } = this;
for (let i = 0, ii = this.newPages.length; i < ii; i++) {
const {
documentData: {
parentTree,
oldRefMapping,
oldStructParentMapping,
usedStructParents,
document: { xref },
},
} = this.oldPages[i];
if (!parentTree) {
continue;
}
const pageRef = this.newPages[i];
const pageDict = this.xref[pageRef.num];
// Visit the new page in order to collect used StructParent entries.
this.#visitObject(pageDict, dict => {
const structParent =
dict.get("StructParent") ?? dict.get("StructParents");
if (typeof structParent !== "number") {
return;
}
usedStructParents.add(structParent);
let parent = parentTree.get(structParent);
const parentRef = parent instanceof Ref ? parent : null;
if (parentRef) {
const array = xref.fetch(parentRef);
if (Array.isArray(array)) {
parent = array;
}
}
if (Array.isArray(parent) && parent.every(ref => ref === null)) {
parent = null;
}
if (!parent) {
if (dict.has("StructParent")) {
dict.delete("StructParent");
} else {
dict.delete("StructParents");
}
return;
}
let newStructParent = oldStructParentMapping.get(structParent);
if (newStructParent === undefined) {
newStructParent = newStructParentId++;
oldStructParentMapping.set(structParent, newStructParent);
newParentTree.set(newStructParent, [oldRefMapping, parent]);
}
if (dict.has("StructParent")) {
dict.set("StructParent", newStructParent);
} else {
dict.set("StructParents", newStructParent);
}
});
}
const {
structTreeKids,
idTree: newIdTree,
classMap: newClassMap,
roleMap: newRoleMap,
namespaces: newNamespaces,
structTreeAF: newStructTreeAF,
structTreePronunciationLexicon: newStructTreePronunciationLexicon,
} = this;
// Clone the struct tree nodes for each document.
for (const documentData of allDocumentData) {
const {
document: { xref },
oldRefMapping,
parentTree,
usedStructParents,
structTreeRoot,
idTree,
classMap,
roleMap,
namespaces,
structTreeAF,
structTreePronunciationLexicon,
} = documentData;
if (!structTreeRoot) {
continue;
}
this.currentDocument = documentData;
// Get all the removed StructElem
const removedStructElements = new RefSet();
for (const [key, value] of parentTree || []) {
if (!usedStructParents.has(key) && value instanceof Ref) {
removedStructElements.put(value);
}
}
// Deduplicate IDs in the ID tree.
// We keep the old node references since they will be cloned later when
// cloning the struct tree.
const dedupIDs = new Map();
for (const [id, nodeRef] of idTree || []) {
let _id = id;
if (newIdTree.has(id)) {
for (let i = 1; ; i++) {
const newId = `${id}_${i}`;
if (!newIdTree.has(newId)) {
dedupIDs.set(id, newId);
_id = newId;
break;
}
}
}
newIdTree.set(_id, nodeRef);
}
const dedupClasses = new Map();
if (classMap?.size > 0) {
// Deduplicate ClassMap entries.
for (let [className, classDict] of classMap) {
classDict = await this.#collectDependencies(classDict, true, xref);
if (newClassMap.has(className)) {
for (let i = 1; ; i++) {
const newClassName = `${className}_${i}`;
if (!newClassMap.has(newClassName)) {
dedupClasses.set(className, newClassName);
className = newClassName;
break;
}
}
}
newClassMap.set(className, classDict);
}
}
const dedupRoles = new Map();
if (roleMap?.size > 0) {
// Deduplicate RoleMap entries.
for (const [roleName, mappedName] of roleMap) {
const newMappedName = newRoleMap.get(roleName);
if (!newMappedName) {
newRoleMap.set(roleName, mappedName);
continue;
}
if (newMappedName === mappedName) {
continue;
}
for (let i = 1; ; i++) {
const newRoleName = `${roleName}_${i}`;
if (!newRoleMap.has(newRoleName)) {
dedupRoles.set(roleName, newRoleName);
newRoleMap.set(newRoleName, mappedName);
break;
}
}
}
}
if (namespaces?.length > 0) {
for (const namespaceRef of namespaces) {
const namespace = await xref.fetchIfRefAsync(namespaceRef);
let ns = namespace.get("NS");
if (!ns || newNamespaces.has(ns)) {
continue;
}
ns = stringToPDFString(ns, /* keepEscapeSequence = */ false);
const newNamespace = await this.#collectDependencies(
namespace,
true,
xref
);
newNamespaces.set(ns, newNamespace);
}
}
if (structTreeAF) {
for (const afRef of structTreeAF) {
newStructTreeAF.push(
await this.#collectDependencies(afRef, true, xref)
);
}
}
if (structTreePronunciationLexicon) {
for (const lexiconRef of structTreePronunciationLexicon) {
newStructTreePronunciationLexicon.push(
await this.#collectDependencies(lexiconRef, true, xref)
);
}
}
// Get the kids.
let kids = structTreeRoot.dict.get("K");
if (!kids) {
continue;
}
kids = Array.isArray(kids) ? kids : [kids];
for (let kid of kids) {
const kidRef = kid instanceof Ref ? kid : null;
if (kidRef && removedStructElements.has(kidRef)) {
continue;
}
kid = await xref.fetchIfRefAsync(kid);
const newKidRef = await this.#cloneStructTreeNode(
kidRef,
kid,
xref,
removedStructElements,
dedupIDs,
dedupClasses,
dedupRoles
);
if (newKidRef) {
structTreeKids.push(newKidRef);
}
}
// Fix the ID tree.
for (const [id, nodeRef] of idTree || []) {
const newNodeRef = oldRefMapping.get(nodeRef);
const newId = dedupIDs.get(id) || id;
if (newNodeRef) {
newIdTree.set(newId, newNodeRef);
} else {
newIdTree.delete(newId);
}
}
}
for (const [key, [oldRefMapping, parent]] of newParentTree) {
if (!parent) {
newParentTree.delete(key);
continue;
}
// Some nodes haven't been visited while cloning the struct trees so their
// ref don't belong to the oldRefMapping. Remove those nodes.
if (!Array.isArray(parent)) {
const newParent = oldRefMapping.get(parent);
if (newParent === undefined) {
newParentTree.delete(key);
} else {
newParentTree.set(key, newParent);
}
continue;
}
const newParents = parent.map(
ref => (ref instanceof Ref && oldRefMapping.get(ref)) || null
);
if (newParents.length === 0 || newParents.every(ref => ref === null)) {
newParentTree.delete(key);
continue;
}
newParentTree.set(key, newParents);
}
this.currentDocument = null;
}
/**
* Collect named destinations that are still valid (i.e. pointing to kept
* pages).
@ -566,7 +1131,7 @@ class PDFEditor {
}
if (stFirstIndex !== -1) {
const st = currentLabel.get("St");
currentLabel = currentLabel.clone();
currentLabel = this.cloneDict(currentLabel);
currentLabel.set("St", st + (i - stFirstIndex));
stFirstIndex = -1;
}
@ -598,7 +1163,7 @@ class PDFEditor {
const { dedupNamedDestinations, oldRefMapping } = documentData;
const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
const pageRef = this.newRef;
const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
const pageDict = (this.xref[pageRef.num] = this.cloneDict(page.pageDict));
oldRefMapping.put(oldPageRef, pageRef);
if (pointingNamedDestinations) {
@ -796,6 +1361,71 @@ class PDFEditor {
);
}
#makeStructTree() {
const { structTreeKids } = this;
if (!structTreeKids || structTreeKids.length === 0) {
return;
}
const { rootDict } = this;
const structTreeRef = this.newRef;
const structTree = (this.xref[structTreeRef.num] = new Dict());
structTree.setIfName("Type", "StructTreeRoot");
structTree.setIfArray("K", structTreeKids);
for (const kidRef of structTreeKids) {
const kid = this.xref[kidRef.num];
const type = kid.get("Type");
if (!type || isName(type, "StructElem")) {
kid.set("P", structTreeRef);
}
}
if (this.parentTree.size > 0) {
const parentTreeRef = this.#makeNameNumTree(
Array.from(this.parentTree.entries()),
/* areNames = */ false
);
const parentTree = this.xref[parentTreeRef.num];
parentTree.setIfName("Type", "ParentTree");
structTree.set("ParentTree", parentTreeRef);
structTree.set("ParentTreeNextKey", this.parentTree.size);
}
if (this.idTree.size > 0) {
const idTreeRef = this.#makeNameNumTree(
Array.from(this.idTree.entries()),
/* areNames = */ true
);
const idTree = this.xref[idTreeRef.num];
idTree.setIfName("Type", "IDTree");
structTree.set("IDTree", idTreeRef);
}
if (this.classMap.size > 0) {
const classMapRef = this.newRef;
this.xref[classMapRef.num] = this.classMap;
structTree.set("ClassMap", classMapRef);
}
if (this.roleMap.size > 0) {
const roleMapRef = this.newRef;
this.xref[roleMapRef.num] = this.roleMap;
structTree.set("RoleMap", roleMapRef);
}
if (this.namespaces.size > 0) {
const namespacesRef = this.newRef;
this.xref[namespacesRef.num] = Array.from(this.namespaces.values());
structTree.set("Namespaces", namespacesRef);
}
if (this.structTreeAF.length > 0) {
const structTreeAFRef = this.newRef;
this.xref[structTreeAFRef.num] = this.structTreeAF;
structTree.set("AF", structTreeAFRef);
}
if (this.structTreePronunciationLexicon.length > 0) {
const structTreePronunciationLexiconRef = this.newRef;
this.xref[structTreePronunciationLexiconRef.num] =
this.structTreePronunciationLexicon;
structTree.set("PronunciationLexicon", structTreePronunciationLexiconRef);
}
rootDict.set("StructTreeRoot", structTreeRef);
}
/**
* Create the root dictionary.
* @returns {Promise<void>}
@ -807,6 +1437,7 @@ class PDFEditor {
this.#makePageTree();
this.#makePageLabelsTree();
this.#makeDestinationsTree();
this.#makeStructTree();
}
/**

View File

@ -34,7 +34,7 @@ class NameOrNumberTree {
this._type = type;
}
getAll() {
getAll(isRaw = false) {
const map = new Map();
if (!this.root) {
return map;
@ -68,7 +68,10 @@ class NameOrNumberTree {
continue;
}
for (let i = 0, ii = entries.length; i < ii; i += 2) {
map.set(xref.fetchIfRef(entries[i]), xref.fetchIfRef(entries[i + 1]));
map.set(
xref.fetchIfRef(entries[i]),
isRaw ? entries[i + 1] : xref.fetchIfRef(entries[i + 1])
);
}
}
return map;

View File

@ -759,3 +759,5 @@
!doc_3_3_pages.pdf
!labelled_pages.pdf
!extract_link.pdf
!two_paragraphs.pdf
!paragraph_and_link.pdf

BIN
test/pdfs/paragraph_and_link.pdf Executable file

Binary file not shown.

BIN
test/pdfs/two_paragraphs.pdf Executable file

Binary file not shown.

View File

@ -5722,5 +5722,251 @@ small scripts as well as for`);
await loadingTask.destroy();
});
});
describe("Struct trees", function () {
it("extract pages and merge struct trees", async function () {
let loadingTask = getDocument(
buildGetDocumentParams("two_paragraphs.pdf")
);
let pdfDoc = await loadingTask.promise;
let pdfPage = await pdfDoc.getPage(1);
const structTree = await pdfPage.getStructTree();
expect(structTree).toEqual({
children: [
{
role: "Document",
children: [
{
role: "Sect",
children: [
{
role: "P",
children: [{ type: "content", id: "p19R_mc0" }],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p19R_mc1" }],
lang: "EN-US",
},
],
},
],
},
],
role: "Root",
});
const filterItems = item => {
if (item.type === "beginMarkedContentProps") {
return item.id;
}
if (item.str !== undefined) {
return item.str;
}
return null;
};
let { items } = await pdfPage.getTextContent({
includeMarkedContent: true,
disableNormalization: true,
});
expect(items.map(filterItems)).toEqual([
"p19R_mc0",
"The first paragraph.",
null,
"p19R_mc1",
"",
"The second paragraph.",
null,
]);
const data = await pdfDoc.extractPages([
{ document: null },
{ document: null },
]);
await loadingTask.destroy();
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
expect(pdfDoc.numPages).toEqual(2);
pdfPage = await pdfDoc.getPage(1);
const structTree1 = await pdfPage.getStructTree();
expect(structTree1).toEqual({
children: [
{
role: "Document",
children: [
{
role: "Sect",
children: [
{
role: "P",
children: [{ type: "content", id: "p4R_mc0" }],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p4R_mc1" }],
lang: "EN-US",
},
],
},
],
},
],
role: "Root",
});
({ items } = await pdfPage.getTextContent({
includeMarkedContent: true,
disableNormalization: true,
}));
expect(items.map(filterItems)).toEqual([
"p4R_mc0",
"The first paragraph.",
null,
"p4R_mc1",
"",
"The second paragraph.",
null,
]);
pdfPage = await pdfDoc.getPage(2);
const structTree2 = await pdfPage.getStructTree();
expect(structTree2).toEqual({
children: [
{
role: "Document",
children: [
{
role: "Sect",
children: [
{
role: "P",
children: [{ type: "content", id: "p19R_mc0" }],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p19R_mc1" }],
lang: "EN-US",
},
],
},
],
},
],
role: "Root",
});
({ items } = await pdfPage.getTextContent({
includeMarkedContent: true,
disableNormalization: true,
}));
expect(items.map(filterItems)).toEqual([
"p19R_mc0",
"The first paragraph.",
null,
"p19R_mc1",
"",
"The second paragraph.",
null,
]);
await loadingTask.destroy();
});
it("extract pages with a removed link", async function () {
let loadingTask = getDocument(
buildGetDocumentParams("paragraph_and_link.pdf")
);
let pdfDoc = await loadingTask.promise;
const data = await pdfDoc.extractPages([
{ document: null, excludePages: [1] },
{ document: null },
]);
await loadingTask.destroy();
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
expect(pdfDoc.numPages).toEqual(3);
let pdfPage = await pdfDoc.getPage(1);
let structTree = await pdfPage.getStructTree();
expect(structTree).toEqual({
children: [
{
role: "Document",
children: [
{
role: "Sect",
children: [
{
role: "P",
children: [{ type: "content", id: "p4R_mc0" }],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p4R_mc3" }],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p4R_mc6" }],
lang: "EN-US",
},
],
},
],
},
],
role: "Root",
});
pdfPage = await pdfDoc.getPage(2);
structTree = await pdfPage.getStructTree();
expect(structTree).toEqual({
children: [
{
role: "Document",
children: [
{
role: "Sect",
children: [
{
role: "P",
children: [{ type: "content", id: "p23R_mc0" }],
lang: "EN-US",
},
{
role: "P",
children: [
{
role: "Reference",
children: [{ type: "content", id: "p23R_mc2" }],
lang: "EN-US",
},
{ type: "content", id: "p23R_mc3" },
],
lang: "EN-US",
},
{
role: "P",
children: [{ type: "content", id: "p23R_mc6" }],
lang: "EN-US",
},
],
},
],
},
],
role: "Root",
});
await loadingTask.destroy();
});
});
});
});