Merge the structure trees coming from different pdfs (bug 1997379)

parent 603e3a8f3e
commit e13a618df3
@@ -17,9 +17,10 @@
 /** @typedef {import("../document.js").Page} Page */
 /** @typedef {import("../xref.js").XRef} XRef */

-import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
+import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
 import { getModificationDate, stringToPDFString } from "../../shared/util.js";
 import { incrementalUpdate, writeValue } from "../writer.js";
+import { NameTree, NumberTree } from "../name_number_tree.js";
 import { BaseStream } from "../base_stream.js";
 import { StringStream } from "../stream.js";
 import { stringToAsciiOrUTF16BE } from "../core_utils.js";
@@ -49,6 +50,16 @@ class DocumentData {
     this.dedupNamedDestinations = new Map();
     this.usedNamedDestinations = new Set();
     this.postponedRefCopies = new RefSetCache();
+    this.usedStructParents = new Set();
+    this.oldStructParentMapping = new Map();
+    this.structTreeRoot = null;
+    this.parentTree = null;
+    this.idTree = null;
+    this.roleMap = null;
+    this.classMap = null;
+    this.namespaces = null;
+    this.structTreeAF = null;
+    this.structTreePronunciationLexicon = [];
   }
 }

@@ -82,6 +93,14 @@ class PDFEditor {
     this.author = author;
     this.pageLabels = null;
    this.namedDestinations = new Map();
+    this.parentTree = new Map();
+    this.structTreeKids = [];
+    this.idTree = new Map();
+    this.classMap = new Dict();
+    this.roleMap = new Dict();
+    this.namespaces = new Map();
+    this.structTreeAF = [];
+    this.structTreePronunciationLexicon = [];
   }

   /**
@@ -115,6 +134,12 @@ class PDFEditor {
     return ref;
   }

+  cloneDict(dict) {
+    const newDict = dict.clone();
+    newDict.xref = this.xrefWrapper;
+    return newDict;
+  }
+
   /**
    * Collect the dependencies of an object and create new references for each
    * dependency.
@@ -212,6 +237,232 @@ class PDFEditor {
     return obj;
   }

+  async #cloneStructTreeNode(
+    parentStructRef,
+    node,
+    xref,
+    removedStructElements,
+    dedupIDs,
+    dedupClasses,
+    dedupRoles,
+    visited = new RefSet()
+  ) {
+    const {
+      currentDocument: { pagesMap, oldRefMapping },
+    } = this;
+    const pg = node.getRaw("Pg");
+    if (pg instanceof Ref && !pagesMap.has(pg)) {
+      return null;
+    }
+    let kids;
+    const k = (kids = node.getRaw("K"));
+    if (k instanceof Ref) {
+      // We're only interested by ref referencing nodes and not an array.
+      if (visited.has(k)) {
+        return null;
+      }
+      kids = await xref.fetchAsync(k);
+      if (!Array.isArray(kids)) {
+        kids = [k];
+      }
+    }
+    kids = Array.isArray(kids) ? kids : [kids];
+    const newKids = [];
+    const structElemIndices = [];
+    for (let kid of kids) {
+      const kidRef = kid instanceof Ref ? kid : null;
+      if (kidRef) {
+        if (visited.has(kidRef)) {
+          continue;
+        }
+        visited.put(kidRef);
+        kid = await xref.fetchAsync(kidRef);
+      }
+      if (typeof kid === "number") {
+        newKids.push(kid);
+        continue;
+      }
+      if (!(kid instanceof Dict)) {
+        continue;
+      }
+      const pgRef = kid.getRaw("Pg");
+      if (pgRef instanceof Ref && !pagesMap.has(pgRef)) {
+        continue;
+      }
+      const type = kid.get("Type");
+      if (!type || isName(type, "StructElem")) {
+        let setAsSpan = false;
+        if (kidRef && removedStructElements.has(kidRef)) {
+          if (!isName(kid.get("S"), "Link")) {
+            continue;
+          }
+          // A link annotation has been removed but we still need to keep the
+          // node in order to preserve the structure tree. Mark it as a Span
+          // so that it doesn't affect the semantics.
+          setAsSpan = true;
+        }
+        const newKidRef = await this.#cloneStructTreeNode(
+          kidRef,
+          kid,
+          xref,
+          removedStructElements,
+          dedupIDs,
+          dedupClasses,
+          dedupRoles,
+          visited
+        );
+        if (newKidRef) {
+          structElemIndices.push(newKids.length);
+          newKids.push(newKidRef);
+          if (kidRef) {
+            oldRefMapping.put(kidRef, newKidRef);
+          }
+          if (setAsSpan) {
+            this.xref[newKidRef.num].setIfName("S", "Span");
+          }
+        }
+        continue;
+      }
+      if (isName(type, "OBJR")) {
+        if (!kidRef) {
+          continue;
+        }
+        const newKidRef = oldRefMapping.get(kidRef);
+        if (!newKidRef) {
+          continue;
+        }
+        const newKid = this.xref[newKidRef.num];
+        // Fix the missing StructParent entry in the referenced object.
+        const objRef = newKid.getRaw("Obj");
+        if (objRef instanceof Ref) {
+          const obj = this.xref[objRef.num];
+          if (
+            obj instanceof Dict &&
+            !obj.has("StructParent") &&
+            parentStructRef
+          ) {
+            const structParent = this.parentTree.size;
+            this.parentTree.set(structParent, [oldRefMapping, parentStructRef]);
+            obj.set("StructParent", structParent);
+          }
+        }
+        newKids.push(newKidRef);
+        continue;
+      }
+      if (isName(type, "MCR")) {
+        const newKid = await this.#collectDependencies(
+          kidRef || kid,
+          true,
+          xref
+        );
+        newKids.push(newKid);
+        continue;
+      }
+      if (kidRef) {
+        const newKidRef = await this.#collectDependencies(kidRef, true, xref);
+        newKids.push(newKidRef);
+      }
+    }
+    if (kids.length !== 0 && newKids.length === 0) {
+      return null;
+    }
+
+    const newNodeRef = this.newRef;
+    const newNode = (this.xref[newNodeRef.num] = this.cloneDict(node));
+    // Don't collect for ID or C since they will be fixed later.
+    newNode.delete("ID");
+    newNode.delete("C");
+    newNode.delete("K");
+    newNode.delete("P");
+    newNode.delete("S");
+    await this.#collectDependencies(newNode, false, xref);
+
+    // Fix the class names.
+    const classNames = node.get("C");
+    if (classNames instanceof Name) {
+      const newClassName = dedupClasses.get(classNames.name);
+      if (newClassName) {
+        newNode.set("C", Name.get(newClassName));
+      } else {
+        newNode.set("C", classNames);
+      }
+    } else if (Array.isArray(classNames)) {
+      const newClassNames = [];
+      for (const className of classNames) {
+        if (className instanceof Name) {
+          const newClassName = dedupClasses.get(className.name);
+          if (newClassName) {
+            newClassNames.push(Name.get(newClassName));
+          } else {
+            newClassNames.push(className);
+          }
+        }
+      }
+      newNode.set("C", newClassNames);
+    }
+
+    // Fix the role name.
+    const roleName = node.get("S");
+    if (roleName instanceof Name) {
+      const newRoleName = dedupRoles.get(roleName.name);
+      if (newRoleName) {
+        newNode.set("S", Name.get(newRoleName));
+      } else {
+        newNode.set("S", roleName);
+      }
+    }
+
+    // Fix the ID.
+    const id = node.get("ID");
+    if (typeof id === "string") {
+      const stringId = stringToPDFString(id, /* keepEscapeSequence = */ false);
+      const newId = dedupIDs.get(stringId);
+      if (newId) {
+        newNode.set("ID", stringToAsciiOrUTF16BE(newId));
+      } else {
+        newNode.set("ID", id);
+      }
+    }
+
+    // Table headers may contain IDs that need to be deduplicated.
+    let attributes = newNode.get("A");
+    if (attributes) {
+      if (!Array.isArray(attributes)) {
+        attributes = [attributes];
+      }
+      for (let attr of attributes) {
+        attr = this.xrefWrapper.fetch(attr);
+        if (isName(attr.get("O"), "Table") && attr.has("Headers")) {
+          const headers = this.xrefWrapper.fetch(attr.getRaw("Headers"));
+          if (Array.isArray(headers)) {
+            for (let i = 0, ii = headers.length; i < ii; i++) {
+              const newId = dedupIDs.get(
+                stringToPDFString(headers[i], /* keepEscapeSequence = */ false)
+              );
+              if (newId) {
+                headers[i] = newId;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    for (const index of structElemIndices) {
+      const structElemRef = newKids[index];
+      const structElem = this.xref[structElemRef.num];
+      structElem.set("P", newNodeRef);
+    }
+
+    if (newKids.length === 1) {
+      newNode.set("K", newKids[0]);
+    } else if (newKids.length > 1) {
+      newNode.set("K", newKids);
+    }
+
+    return newNodeRef;
+  }
+
   /**
    * @typedef {Object} PageInfo
    * @property {PDFDocument} document
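Note that a StructElem's /K entry may be a single marked-content id (a number), a dictionary, a reference, or an array of any of those, which is why the method above normalizes it to an array before walking the kids. A minimal sketch of that normalization, for illustration only (not part of the diff):

// Normalize a StructElem /K value to an array of kids: a bare number (MCID),
// Dict or Ref becomes a one-element array, and a missing value becomes [].
function normalizeKids(k) {
  if (k === undefined || k === null) {
    return [];
  }
  return Array.isArray(k) ? k : [k];
}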
@@ -315,6 +566,7 @@ class PDFEditor {
     }

     this.#fixPostponedRefCopies(allDocumentData);
+    await this.#mergeStructTrees(allDocumentData);

     return this.writePDF();
   }
@@ -326,7 +578,7 @@ class PDFEditor {
    */
   async #collectDocumentData(documentData) {
     const {
-      document: { pdfManager },
+      document: { pdfManager, xref },
     } = documentData;
     await Promise.all([
       pdfManager
@@ -335,7 +587,34 @@ class PDFEditor {
       pdfManager
         .ensureCatalog("rawPageLabels")
         .then(pageLabels => (documentData.pageLabels = pageLabels)),
+      pdfManager
+        .ensureCatalog("structTreeRoot")
+        .then(structTreeRoot => (documentData.structTreeRoot = structTreeRoot)),
     ]);
+    const structTreeRoot = documentData.structTreeRoot;
+    if (structTreeRoot) {
+      const rootDict = structTreeRoot.dict;
+      const parentTree = rootDict.get("ParentTree");
+      if (parentTree) {
+        const numberTree = new NumberTree(parentTree, xref);
+        documentData.parentTree = numberTree.getAll(/* isRaw = */ true);
+      }
+      const idTree = rootDict.get("IDTree");
+      if (idTree) {
+        const nameTree = new NameTree(idTree, xref);
+        documentData.idTree = nameTree.getAll(/* isRaw = */ true);
+      }
+      documentData.roleMap = rootDict.get("RoleMap") || null;
+      documentData.classMap = rootDict.get("ClassMap") || null;
+      let namespaces = rootDict.get("Namespaces") || null;
+      if (namespaces && !Array.isArray(namespaces)) {
+        namespaces = [namespaces];
+      }
+      documentData.namespaces = namespaces;
+      documentData.structTreeAF = rootDict.get("AF") || null;
+      documentData.structTreePronunciationLexicon =
+        rootDict.get("PronunciationLexicon") || null;
+    }
   }

   /**
@@ -371,7 +650,6 @@ class PDFEditor {
         action instanceof Dict
           ? action.get("D")
           : annotationDict.get("Dest");
-
       if (
         !dest /* not a destination */ ||
         (Array.isArray(dest) &&
@@ -431,6 +709,293 @@ class PDFEditor {
     }
   }

+  #visitObject(obj, callback, visited = new RefSet()) {
+    if (obj instanceof Ref) {
+      if (!visited.has(obj)) {
+        visited.put(obj);
+        this.#visitObject(this.xref[obj.num], callback, visited);
+      }
+      return;
+    }
+    if (Array.isArray(obj)) {
+      for (const item of obj) {
+        this.#visitObject(item, callback, visited);
+      }
+      return;
+    }
+    let dict;
+    if (obj instanceof BaseStream) {
+      ({ dict } = obj);
+    } else if (obj instanceof Dict) {
+      dict = obj;
+    }
+    if (dict) {
+      callback(dict);
+      for (const value of dict.getRawValues()) {
+        this.#visitObject(value, callback, visited);
+      }
+    }
+  }
+
+  async #mergeStructTrees(allDocumentData) {
+    let newStructParentId = 0;
+    const { parentTree: newParentTree } = this;
+    for (let i = 0, ii = this.newPages.length; i < ii; i++) {
+      const {
+        documentData: {
+          parentTree,
+          oldRefMapping,
+          oldStructParentMapping,
+          usedStructParents,
+          document: { xref },
+        },
+      } = this.oldPages[i];
+      if (!parentTree) {
+        continue;
+      }
+      const pageRef = this.newPages[i];
+      const pageDict = this.xref[pageRef.num];
+
+      // Visit the new page in order to collect used StructParent entries.
+      this.#visitObject(pageDict, dict => {
+        const structParent =
+          dict.get("StructParent") ?? dict.get("StructParents");
+        if (typeof structParent !== "number") {
+          return;
+        }
+        usedStructParents.add(structParent);
+        let parent = parentTree.get(structParent);
+        const parentRef = parent instanceof Ref ? parent : null;
+        if (parentRef) {
+          const array = xref.fetch(parentRef);
+          if (Array.isArray(array)) {
+            parent = array;
+          }
+        }
+        if (Array.isArray(parent) && parent.every(ref => ref === null)) {
+          parent = null;
+        }
+        if (!parent) {
+          if (dict.has("StructParent")) {
+            dict.delete("StructParent");
+          } else {
+            dict.delete("StructParents");
+          }
+          return;
+        }
+        let newStructParent = oldStructParentMapping.get(structParent);
+        if (newStructParent === undefined) {
+          newStructParent = newStructParentId++;
+          oldStructParentMapping.set(structParent, newStructParent);
+          newParentTree.set(newStructParent, [oldRefMapping, parent]);
+        }
+        if (dict.has("StructParent")) {
+          dict.set("StructParent", newStructParent);
+        } else {
+          dict.set("StructParents", newStructParent);
+        }
+      });
+    }
+
+    const {
+      structTreeKids,
+      idTree: newIdTree,
+      classMap: newClassMap,
+      roleMap: newRoleMap,
+      namespaces: newNamespaces,
+      structTreeAF: newStructTreeAF,
+      structTreePronunciationLexicon: newStructTreePronunciationLexicon,
+    } = this;
+    // Clone the struct tree nodes for each document.
+    for (const documentData of allDocumentData) {
+      const {
+        document: { xref },
+        oldRefMapping,
+        parentTree,
+        usedStructParents,
+        structTreeRoot,
+        idTree,
+        classMap,
+        roleMap,
+        namespaces,
+        structTreeAF,
+        structTreePronunciationLexicon,
+      } = documentData;
+
+      if (!structTreeRoot) {
+        continue;
+      }
+
+      this.currentDocument = documentData;
+      // Get all the removed StructElem
+      const removedStructElements = new RefSet();
+      for (const [key, value] of parentTree || []) {
+        if (!usedStructParents.has(key) && value instanceof Ref) {
+          removedStructElements.put(value);
+        }
+      }
+
+      // Deduplicate IDs in the ID tree.
+      // We keep the old node references since they will be cloned later when
+      // cloning the struct tree.
+      const dedupIDs = new Map();
+      for (const [id, nodeRef] of idTree || []) {
+        let _id = id;
+        if (newIdTree.has(id)) {
+          for (let i = 1; ; i++) {
+            const newId = `${id}_${i}`;
+            if (!newIdTree.has(newId)) {
+              dedupIDs.set(id, newId);
+              _id = newId;
+              break;
+            }
+          }
+        }
+        newIdTree.set(_id, nodeRef);
+      }
+
+      const dedupClasses = new Map();
+      if (classMap?.size > 0) {
+        // Deduplicate ClassMap entries.
+        for (let [className, classDict] of classMap) {
+          classDict = await this.#collectDependencies(classDict, true, xref);
+          if (newClassMap.has(className)) {
+            for (let i = 1; ; i++) {
+              const newClassName = `${className}_${i}`;
+              if (!newClassMap.has(newClassName)) {
+                dedupClasses.set(className, newClassName);
+                className = newClassName;
+                break;
+              }
+            }
+          }
+          newClassMap.set(className, classDict);
+        }
+      }
+
+      const dedupRoles = new Map();
+      if (roleMap?.size > 0) {
+        // Deduplicate RoleMap entries.
+        for (const [roleName, mappedName] of roleMap) {
+          const newMappedName = newRoleMap.get(roleName);
+          if (!newMappedName) {
+            newRoleMap.set(roleName, mappedName);
+            continue;
+          }
+          if (newMappedName === mappedName) {
+            continue;
+          }
+          for (let i = 1; ; i++) {
+            const newRoleName = `${roleName}_${i}`;
+            if (!newRoleMap.has(newRoleName)) {
+              dedupRoles.set(roleName, newRoleName);
+              newRoleMap.set(newRoleName, mappedName);
+              break;
+            }
+          }
+        }
+      }
+
+      if (namespaces?.length > 0) {
+        for (const namespaceRef of namespaces) {
+          const namespace = await xref.fetchIfRefAsync(namespaceRef);
+          let ns = namespace.get("NS");
+          if (!ns || newNamespaces.has(ns)) {
+            continue;
+          }
+          ns = stringToPDFString(ns, /* keepEscapeSequence = */ false);
+          const newNamespace = await this.#collectDependencies(
+            namespace,
+            true,
+            xref
+          );
+          newNamespaces.set(ns, newNamespace);
+        }
+      }
+
+      if (structTreeAF) {
+        for (const afRef of structTreeAF) {
+          newStructTreeAF.push(
+            await this.#collectDependencies(afRef, true, xref)
+          );
+        }
+      }
+
+      if (structTreePronunciationLexicon) {
+        for (const lexiconRef of structTreePronunciationLexicon) {
+          newStructTreePronunciationLexicon.push(
+            await this.#collectDependencies(lexiconRef, true, xref)
+          );
+        }
+      }
+
+      // Get the kids.
+      let kids = structTreeRoot.dict.get("K");
+      if (!kids) {
+        continue;
+      }
+      kids = Array.isArray(kids) ? kids : [kids];
+      for (let kid of kids) {
+        const kidRef = kid instanceof Ref ? kid : null;
+        if (kidRef && removedStructElements.has(kidRef)) {
+          continue;
+        }
+        kid = await xref.fetchIfRefAsync(kid);
+        const newKidRef = await this.#cloneStructTreeNode(
+          kidRef,
+          kid,
+          xref,
+          removedStructElements,
+          dedupIDs,
+          dedupClasses,
+          dedupRoles
+        );
+        if (newKidRef) {
+          structTreeKids.push(newKidRef);
+        }
+      }
+
+      // Fix the ID tree.
+      for (const [id, nodeRef] of idTree || []) {
+        const newNodeRef = oldRefMapping.get(nodeRef);
+        const newId = dedupIDs.get(id) || id;
+        if (newNodeRef) {
+          newIdTree.set(newId, newNodeRef);
+        } else {
+          newIdTree.delete(newId);
+        }
+      }
+    }
+
+    for (const [key, [oldRefMapping, parent]] of newParentTree) {
+      if (!parent) {
+        newParentTree.delete(key);
+        continue;
+      }
+      // Some nodes haven't been visited while cloning the struct trees so their
+      // ref don't belong to the oldRefMapping. Remove those nodes.
+      if (!Array.isArray(parent)) {
+        const newParent = oldRefMapping.get(parent);
+        if (newParent === undefined) {
+          newParentTree.delete(key);
+        } else {
+          newParentTree.set(key, newParent);
+        }
+        continue;
+      }
+      const newParents = parent.map(
+        ref => (ref instanceof Ref && oldRefMapping.get(ref)) || null
+      );
+      if (newParents.length === 0 || newParents.every(ref => ref === null)) {
+        newParentTree.delete(key);
+        continue;
+      }
+      newParentTree.set(key, newParents);
+    }
+
+    this.currentDocument = null;
+  }
+
   /**
    * Collect named destinations that are still valid (i.e. pointing to kept
    * pages).
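When two source documents use the same element ID, class name or role name, the merge above keeps the first occurrence and renames later ones by appending a numeric suffix until the name is free in the merged map. A minimal sketch of that scheme, for illustration only (not part of the diff):

// Return `name` unchanged if it is free in `usedNames` (a Set or Map of
// already merged names), otherwise try name_1, name_2, ... until one is free.
function dedupName(name, usedNames) {
  if (!usedNames.has(name)) {
    return name;
  }
  for (let i = 1; ; i++) {
    const candidate = `${name}_${i}`;
    if (!usedNames.has(candidate)) {
      return candidate;
    }
  }
}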
@@ -566,7 +1131,7 @@ class PDFEditor {
       }
       if (stFirstIndex !== -1) {
         const st = currentLabel.get("St");
-        currentLabel = currentLabel.clone();
+        currentLabel = this.cloneDict(currentLabel);
         currentLabel.set("St", st + (i - stFirstIndex));
         stFirstIndex = -1;
       }
@@ -598,7 +1163,7 @@ class PDFEditor {
     const { dedupNamedDestinations, oldRefMapping } = documentData;
     const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
     const pageRef = this.newRef;
-    const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
+    const pageDict = (this.xref[pageRef.num] = this.cloneDict(page.pageDict));
     oldRefMapping.put(oldPageRef, pageRef);

     if (pointingNamedDestinations) {
@@ -796,6 +1361,71 @@ class PDFEditor {
     );
   }

+  #makeStructTree() {
+    const { structTreeKids } = this;
+    if (!structTreeKids || structTreeKids.length === 0) {
+      return;
+    }
+    const { rootDict } = this;
+    const structTreeRef = this.newRef;
+    const structTree = (this.xref[structTreeRef.num] = new Dict());
+    structTree.setIfName("Type", "StructTreeRoot");
+    structTree.setIfArray("K", structTreeKids);
+    for (const kidRef of structTreeKids) {
+      const kid = this.xref[kidRef.num];
+      const type = kid.get("Type");
+      if (!type || isName(type, "StructElem")) {
+        kid.set("P", structTreeRef);
+      }
+    }
+    if (this.parentTree.size > 0) {
+      const parentTreeRef = this.#makeNameNumTree(
+        Array.from(this.parentTree.entries()),
+        /* areNames = */ false
+      );
+      const parentTree = this.xref[parentTreeRef.num];
+      parentTree.setIfName("Type", "ParentTree");
+      structTree.set("ParentTree", parentTreeRef);
+      structTree.set("ParentTreeNextKey", this.parentTree.size);
+    }
+    if (this.idTree.size > 0) {
+      const idTreeRef = this.#makeNameNumTree(
+        Array.from(this.idTree.entries()),
+        /* areNames = */ true
+      );
+      const idTree = this.xref[idTreeRef.num];
+      idTree.setIfName("Type", "IDTree");
+      structTree.set("IDTree", idTreeRef);
+    }
+    if (this.classMap.size > 0) {
+      const classMapRef = this.newRef;
+      this.xref[classMapRef.num] = this.classMap;
+      structTree.set("ClassMap", classMapRef);
+    }
+    if (this.roleMap.size > 0) {
+      const roleMapRef = this.newRef;
+      this.xref[roleMapRef.num] = this.roleMap;
+      structTree.set("RoleMap", roleMapRef);
+    }
+    if (this.namespaces.size > 0) {
+      const namespacesRef = this.newRef;
+      this.xref[namespacesRef.num] = Array.from(this.namespaces.values());
+      structTree.set("Namespaces", namespacesRef);
+    }
+    if (this.structTreeAF.length > 0) {
+      const structTreeAFRef = this.newRef;
+      this.xref[structTreeAFRef.num] = this.structTreeAF;
+      structTree.set("AF", structTreeAFRef);
+    }
+    if (this.structTreePronunciationLexicon.length > 0) {
+      const structTreePronunciationLexiconRef = this.newRef;
+      this.xref[structTreePronunciationLexiconRef.num] =
+        this.structTreePronunciationLexicon;
+      structTree.set("PronunciationLexicon", structTreePronunciationLexiconRef);
+    }
+    rootDict.set("StructTreeRoot", structTreeRef);
+  }
+
   /**
    * Create the root dictionary.
    * @returns {Promise<void>}
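For orientation, #makeStructTree only emits the optional entries that ended up non-empty after the merge. A rough description of the resulting StructTreeRoot dictionary, written as a plain object for illustration only (not part of the diff):

// Illustrative shape of the merged StructTreeRoot; each entry is only set
// when the corresponding merged data exists in at least one source document.
const mergedStructTreeRootShape = {
  Type: "StructTreeRoot",
  K: ["<cloned top-level StructElem references>"],
  ParentTree: "<number tree built from the merged parent tree entries>",
  ParentTreeNextKey: "<number of merged parent tree entries>",
  IDTree: "<name tree built from the merged, deduplicated element IDs>",
  ClassMap: "<merged class map>",
  RoleMap: "<merged role map>",
  Namespaces: "<array of merged namespace dictionaries>",
  AF: "<merged associated files>",
  PronunciationLexicon: "<merged pronunciation lexicons>",
};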
@@ -807,6 +1437,7 @@ class PDFEditor {
     this.#makePageTree();
     this.#makePageLabelsTree();
     this.#makeDestinationsTree();
+    this.#makeStructTree();
   }

   /**
@@ -34,7 +34,7 @@ class NameOrNumberTree {
     this._type = type;
   }

-  getAll() {
+  getAll(isRaw = false) {
     const map = new Map();
     if (!this.root) {
       return map;
@@ -68,7 +68,10 @@ class NameOrNumberTree {
         continue;
       }
       for (let i = 0, ii = entries.length; i < ii; i += 2) {
-        map.set(xref.fetchIfRef(entries[i]), xref.fetchIfRef(entries[i + 1]));
+        map.set(
+          xref.fetchIfRef(entries[i]),
+          isRaw ? entries[i + 1] : xref.fetchIfRef(entries[i + 1])
+        );
       }
     }
     return map;
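The new isRaw flag lets callers keep the tree values as raw objects (typically Refs) instead of resolving them through the xref, which is what #collectDocumentData relies on when reading the ParentTree and IDTree so the values can be remapped later. An illustrative usage sketch; the parentTree value and xref instance are assumed to come from the surrounding code:

import { NumberTree } from "../name_number_tree.js";

const numberTree = new NumberTree(parentTree, xref);
const resolved = numberTree.getAll();               // values fetched through the xref
const raw = numberTree.getAll(/* isRaw = */ true);  // values kept as raw Refs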
test/pdfs/.gitignore (vendored, 2 additions)
@@ -759,3 +759,5 @@
 !doc_3_3_pages.pdf
 !labelled_pages.pdf
 !extract_link.pdf
+!two_paragraphs.pdf
+!paragraph_and_link.pdf
test/pdfs/paragraph_and_link.pdf (BIN, new executable file; binary file not shown)
test/pdfs/two_paragraphs.pdf (BIN, new executable file; binary file not shown)
@@ -5722,5 +5722,251 @@ small scripts as well as for`);
         await loadingTask.destroy();
       });
     });
+
+    describe("Struct trees", function () {
+      it("extract pages and merge struct trees", async function () {
+        let loadingTask = getDocument(
+          buildGetDocumentParams("two_paragraphs.pdf")
+        );
+        let pdfDoc = await loadingTask.promise;
+        let pdfPage = await pdfDoc.getPage(1);
+        const structTree = await pdfPage.getStructTree();
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+        const filterItems = item => {
+          if (item.type === "beginMarkedContentProps") {
+            return item.id;
+          }
+          if (item.str !== undefined) {
+            return item.str;
+          }
+          return null;
+        };
+        let { items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        });
+        expect(items.map(filterItems)).toEqual([
+          "p19R_mc0",
+          "The first paragraph.",
+          null,
+          "p19R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        const data = await pdfDoc.extractPages([
+          { document: null },
+          { document: null },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        expect(pdfDoc.numPages).toEqual(2);
+        pdfPage = await pdfDoc.getPage(1);
+        const structTree1 = await pdfPage.getStructTree();
+        expect(structTree1).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        ({ items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        }));
+        expect(items.map(filterItems)).toEqual([
+          "p4R_mc0",
+          "The first paragraph.",
+          null,
+          "p4R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        pdfPage = await pdfDoc.getPage(2);
+        const structTree2 = await pdfPage.getStructTree();
+        expect(structTree2).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        ({ items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        }));
+        expect(items.map(filterItems)).toEqual([
+          "p19R_mc0",
+          "The first paragraph.",
+          null,
+          "p19R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        await loadingTask.destroy();
+      });
+
+      it("extract pages with a removed link", async function () {
+        let loadingTask = getDocument(
+          buildGetDocumentParams("paragraph_and_link.pdf")
+        );
+        let pdfDoc = await loadingTask.promise;
+
+        const data = await pdfDoc.extractPages([
+          { document: null, excludePages: [1] },
+          { document: null },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        expect(pdfDoc.numPages).toEqual(3);
+        let pdfPage = await pdfDoc.getPage(1);
+        let structTree = await pdfPage.getStructTree();
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc3" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc6" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        pdfPage = await pdfDoc.getPage(2);
+        structTree = await pdfPage.getStructTree();
+
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p23R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [
+                        {
+                          role: "Reference",
+                          children: [{ type: "content", id: "p23R_mc2" }],
+                          lang: "EN-US",
+                        },
+                        { type: "content", id: "p23R_mc3" },
+                      ],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p23R_mc6" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+        await loadingTask.destroy();
+      });
+    });
   });
 });