Merge the structure trees coming from different pdfs (bug 1997379)

parent 603e3a8f3e
commit e13a618df3
@@ -17,9 +17,10 @@
 /** @typedef {import("../document.js").Page} Page */
 /** @typedef {import("../xref.js").XRef} XRef */

-import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
+import { Dict, isName, Name, Ref, RefSet, RefSetCache } from "../primitives.js";
 import { getModificationDate, stringToPDFString } from "../../shared/util.js";
 import { incrementalUpdate, writeValue } from "../writer.js";
+import { NameTree, NumberTree } from "../name_number_tree.js";
 import { BaseStream } from "../base_stream.js";
 import { StringStream } from "../stream.js";
 import { stringToAsciiOrUTF16BE } from "../core_utils.js";
@@ -49,6 +50,16 @@ class DocumentData {
     this.dedupNamedDestinations = new Map();
     this.usedNamedDestinations = new Set();
     this.postponedRefCopies = new RefSetCache();
+    this.usedStructParents = new Set();
+    this.oldStructParentMapping = new Map();
+    this.structTreeRoot = null;
+    this.parentTree = null;
+    this.idTree = null;
+    this.roleMap = null;
+    this.classMap = null;
+    this.namespaces = null;
+    this.structTreeAF = null;
+    this.structTreePronunciationLexicon = [];
   }
 }

@@ -82,6 +93,14 @@ class PDFEditor {
     this.author = author;
     this.pageLabels = null;
    this.namedDestinations = new Map();
+    this.parentTree = new Map();
+    this.structTreeKids = [];
+    this.idTree = new Map();
+    this.classMap = new Dict();
+    this.roleMap = new Dict();
+    this.namespaces = new Map();
+    this.structTreeAF = [];
+    this.structTreePronunciationLexicon = [];
   }

   /**
@@ -115,6 +134,12 @@ class PDFEditor {
     return ref;
   }

+  cloneDict(dict) {
+    const newDict = dict.clone();
+    newDict.xref = this.xrefWrapper;
+    return newDict;
+  }
+
   /**
    * Collect the dependencies of an object and create new references for each
    * dependency.
@@ -212,6 +237,232 @@ class PDFEditor {
     return obj;
   }

+  async #cloneStructTreeNode(
+    parentStructRef,
+    node,
+    xref,
+    removedStructElements,
+    dedupIDs,
+    dedupClasses,
+    dedupRoles,
+    visited = new RefSet()
+  ) {
+    const {
+      currentDocument: { pagesMap, oldRefMapping },
+    } = this;
+    const pg = node.getRaw("Pg");
+    if (pg instanceof Ref && !pagesMap.has(pg)) {
+      return null;
+    }
+    let kids;
+    const k = (kids = node.getRaw("K"));
+    if (k instanceof Ref) {
+      // We're only interested by ref referencing nodes and not an array.
+      if (visited.has(k)) {
+        return null;
+      }
+      kids = await xref.fetchAsync(k);
+      if (!Array.isArray(kids)) {
+        kids = [k];
+      }
+    }
+    kids = Array.isArray(kids) ? kids : [kids];
+    const newKids = [];
+    const structElemIndices = [];
+    for (let kid of kids) {
+      const kidRef = kid instanceof Ref ? kid : null;
+      if (kidRef) {
+        if (visited.has(kidRef)) {
+          continue;
+        }
+        visited.put(kidRef);
+        kid = await xref.fetchAsync(kidRef);
+      }
+      if (typeof kid === "number") {
+        newKids.push(kid);
+        continue;
+      }
+      if (!(kid instanceof Dict)) {
+        continue;
+      }
+      const pgRef = kid.getRaw("Pg");
+      if (pgRef instanceof Ref && !pagesMap.has(pgRef)) {
+        continue;
+      }
+      const type = kid.get("Type");
+      if (!type || isName(type, "StructElem")) {
+        let setAsSpan = false;
+        if (kidRef && removedStructElements.has(kidRef)) {
+          if (!isName(kid.get("S"), "Link")) {
+            continue;
+          }
+          // A link annotation has been removed but we still need to keep the
+          // node in order to preserve the structure tree. Mark it as a Span
+          // so that it doesn't affect the semantics.
+          setAsSpan = true;
+        }
+        const newKidRef = await this.#cloneStructTreeNode(
+          kidRef,
+          kid,
+          xref,
+          removedStructElements,
+          dedupIDs,
+          dedupClasses,
+          dedupRoles,
+          visited
+        );
+        if (newKidRef) {
+          structElemIndices.push(newKids.length);
+          newKids.push(newKidRef);
+          if (kidRef) {
+            oldRefMapping.put(kidRef, newKidRef);
+          }
+          if (setAsSpan) {
+            this.xref[newKidRef.num].setIfName("S", "Span");
+          }
+        }
+        continue;
+      }
+      if (isName(type, "OBJR")) {
+        if (!kidRef) {
+          continue;
+        }
+        const newKidRef = oldRefMapping.get(kidRef);
+        if (!newKidRef) {
+          continue;
+        }
+        const newKid = this.xref[newKidRef.num];
+        // Fix the missing StructParent entry in the referenced object.
+        const objRef = newKid.getRaw("Obj");
+        if (objRef instanceof Ref) {
+          const obj = this.xref[objRef.num];
+          if (
+            obj instanceof Dict &&
+            !obj.has("StructParent") &&
+            parentStructRef
+          ) {
+            const structParent = this.parentTree.size;
+            this.parentTree.set(structParent, [oldRefMapping, parentStructRef]);
+            obj.set("StructParent", structParent);
+          }
+        }
+        newKids.push(newKidRef);
+        continue;
+      }
+      if (isName(type, "MCR")) {
+        const newKid = await this.#collectDependencies(
+          kidRef || kid,
+          true,
+          xref
+        );
+        newKids.push(newKid);
+        continue;
+      }
+      if (kidRef) {
+        const newKidRef = await this.#collectDependencies(kidRef, true, xref);
+        newKids.push(newKidRef);
+      }
+    }
+    if (kids.length !== 0 && newKids.length === 0) {
+      return null;
+    }
+
+    const newNodeRef = this.newRef;
+    const newNode = (this.xref[newNodeRef.num] = this.cloneDict(node));
+    // Don't collect for ID or C since they will be fixed later.
+    newNode.delete("ID");
+    newNode.delete("C");
+    newNode.delete("K");
+    newNode.delete("P");
+    newNode.delete("S");
+    await this.#collectDependencies(newNode, false, xref);
+
+    // Fix the class names.
+    const classNames = node.get("C");
+    if (classNames instanceof Name) {
+      const newClassName = dedupClasses.get(classNames.name);
+      if (newClassName) {
+        newNode.set("C", Name.get(newClassName));
+      } else {
+        newNode.set("C", classNames);
+      }
+    } else if (Array.isArray(classNames)) {
+      const newClassNames = [];
+      for (const className of classNames) {
+        if (className instanceof Name) {
+          const newClassName = dedupClasses.get(className.name);
+          if (newClassName) {
+            newClassNames.push(Name.get(newClassName));
+          } else {
+            newClassNames.push(className);
+          }
+        }
+      }
+      newNode.set("C", newClassNames);
+    }
+
+    // Fix the role name.
+    const roleName = node.get("S");
+    if (roleName instanceof Name) {
+      const newRoleName = dedupRoles.get(roleName.name);
+      if (newRoleName) {
+        newNode.set("S", Name.get(newRoleName));
+      } else {
+        newNode.set("S", roleName);
+      }
+    }
+
+    // Fix the ID.
+    const id = node.get("ID");
+    if (typeof id === "string") {
+      const stringId = stringToPDFString(id, /* keepEscapeSequence = */ false);
+      const newId = dedupIDs.get(stringId);
+      if (newId) {
+        newNode.set("ID", stringToAsciiOrUTF16BE(newId));
+      } else {
+        newNode.set("ID", id);
+      }
+    }
+
+    // Table headers may contain IDs that need to be deduplicated.
+    let attributes = newNode.get("A");
+    if (attributes) {
+      if (!Array.isArray(attributes)) {
+        attributes = [attributes];
+      }
+      for (let attr of attributes) {
+        attr = this.xrefWrapper.fetch(attr);
+        if (isName(attr.get("O"), "Table") && attr.has("Headers")) {
+          const headers = this.xrefWrapper.fetch(attr.getRaw("Headers"));
+          if (Array.isArray(headers)) {
+            for (let i = 0, ii = headers.length; i < ii; i++) {
+              const newId = dedupIDs.get(
+                stringToPDFString(headers[i], /* keepEscapeSequence = */ false)
+              );
+              if (newId) {
+                headers[i] = newId;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    for (const index of structElemIndices) {
+      const structElemRef = newKids[index];
+      const structElem = this.xref[structElemRef.num];
+      structElem.set("P", newNodeRef);
+    }
+
+    if (newKids.length === 1) {
+      newNode.set("K", newKids[0]);
+    } else if (newKids.length > 1) {
+      newNode.set("K", newKids);
+    }
+
+    return newNodeRef;
+  }
+
   /**
    * @typedef {Object} PageInfo
    * @property {PDFDocument} document
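Note that a StructElem's /K entry may be a single marked-content id (a number), a dictionary, a reference, or an array of any of those, which is why the method above normalizes it to an array before walking the kids. A minimal sketch of that normalization, for illustration only (not part of the diff):

// Normalize a StructElem /K value to an array of kids: a bare number (MCID),
// Dict or Ref becomes a one-element array, and a missing value becomes [].
function normalizeKids(k) {
  if (k === undefined || k === null) {
    return [];
  }
  return Array.isArray(k) ? k : [k];
}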
@@ -315,6 +566,7 @@ class PDFEditor {
     }

     this.#fixPostponedRefCopies(allDocumentData);
+    await this.#mergeStructTrees(allDocumentData);

     return this.writePDF();
   }
@@ -326,7 +578,7 @@ class PDFEditor {
    */
   async #collectDocumentData(documentData) {
     const {
-      document: { pdfManager },
+      document: { pdfManager, xref },
     } = documentData;
     await Promise.all([
       pdfManager
@@ -335,7 +587,34 @@ class PDFEditor {
       pdfManager
         .ensureCatalog("rawPageLabels")
         .then(pageLabels => (documentData.pageLabels = pageLabels)),
+      pdfManager
+        .ensureCatalog("structTreeRoot")
+        .then(structTreeRoot => (documentData.structTreeRoot = structTreeRoot)),
     ]);
+    const structTreeRoot = documentData.structTreeRoot;
+    if (structTreeRoot) {
+      const rootDict = structTreeRoot.dict;
+      const parentTree = rootDict.get("ParentTree");
+      if (parentTree) {
+        const numberTree = new NumberTree(parentTree, xref);
+        documentData.parentTree = numberTree.getAll(/* isRaw = */ true);
+      }
+      const idTree = rootDict.get("IDTree");
+      if (idTree) {
+        const nameTree = new NameTree(idTree, xref);
+        documentData.idTree = nameTree.getAll(/* isRaw = */ true);
+      }
+      documentData.roleMap = rootDict.get("RoleMap") || null;
+      documentData.classMap = rootDict.get("ClassMap") || null;
+      let namespaces = rootDict.get("Namespaces") || null;
+      if (namespaces && !Array.isArray(namespaces)) {
+        namespaces = [namespaces];
+      }
+      documentData.namespaces = namespaces;
+      documentData.structTreeAF = rootDict.get("AF") || null;
+      documentData.structTreePronunciationLexicon =
+        rootDict.get("PronunciationLexicon") || null;
+    }
   }

   /**
@@ -371,7 +650,6 @@ class PDFEditor {
         action instanceof Dict
           ? action.get("D")
           : annotationDict.get("Dest");
-
       if (
         !dest /* not a destination */ ||
         (Array.isArray(dest) &&
@@ -431,6 +709,293 @@ class PDFEditor {
     }
   }

+  #visitObject(obj, callback, visited = new RefSet()) {
+    if (obj instanceof Ref) {
+      if (!visited.has(obj)) {
+        visited.put(obj);
+        this.#visitObject(this.xref[obj.num], callback, visited);
+      }
+      return;
+    }
+    if (Array.isArray(obj)) {
+      for (const item of obj) {
+        this.#visitObject(item, callback, visited);
+      }
+      return;
+    }
+    let dict;
+    if (obj instanceof BaseStream) {
+      ({ dict } = obj);
+    } else if (obj instanceof Dict) {
+      dict = obj;
+    }
+    if (dict) {
+      callback(dict);
+      for (const value of dict.getRawValues()) {
+        this.#visitObject(value, callback, visited);
+      }
+    }
+  }
+
+  async #mergeStructTrees(allDocumentData) {
+    let newStructParentId = 0;
+    const { parentTree: newParentTree } = this;
+    for (let i = 0, ii = this.newPages.length; i < ii; i++) {
+      const {
+        documentData: {
+          parentTree,
+          oldRefMapping,
+          oldStructParentMapping,
+          usedStructParents,
+          document: { xref },
+        },
+      } = this.oldPages[i];
+      if (!parentTree) {
+        continue;
+      }
+      const pageRef = this.newPages[i];
+      const pageDict = this.xref[pageRef.num];
+
+      // Visit the new page in order to collect used StructParent entries.
+      this.#visitObject(pageDict, dict => {
+        const structParent =
+          dict.get("StructParent") ?? dict.get("StructParents");
+        if (typeof structParent !== "number") {
+          return;
+        }
+        usedStructParents.add(structParent);
+        let parent = parentTree.get(structParent);
+        const parentRef = parent instanceof Ref ? parent : null;
+        if (parentRef) {
+          const array = xref.fetch(parentRef);
+          if (Array.isArray(array)) {
+            parent = array;
+          }
+        }
+        if (Array.isArray(parent) && parent.every(ref => ref === null)) {
+          parent = null;
+        }
+        if (!parent) {
+          if (dict.has("StructParent")) {
+            dict.delete("StructParent");
+          } else {
+            dict.delete("StructParents");
+          }
+          return;
+        }
+        let newStructParent = oldStructParentMapping.get(structParent);
+        if (newStructParent === undefined) {
+          newStructParent = newStructParentId++;
+          oldStructParentMapping.set(structParent, newStructParent);
+          newParentTree.set(newStructParent, [oldRefMapping, parent]);
+        }
+        if (dict.has("StructParent")) {
+          dict.set("StructParent", newStructParent);
+        } else {
+          dict.set("StructParents", newStructParent);
+        }
+      });
+    }
+
+    const {
+      structTreeKids,
+      idTree: newIdTree,
+      classMap: newClassMap,
+      roleMap: newRoleMap,
+      namespaces: newNamespaces,
+      structTreeAF: newStructTreeAF,
+      structTreePronunciationLexicon: newStructTreePronunciationLexicon,
+    } = this;
+    // Clone the struct tree nodes for each document.
+    for (const documentData of allDocumentData) {
+      const {
+        document: { xref },
+        oldRefMapping,
+        parentTree,
+        usedStructParents,
+        structTreeRoot,
+        idTree,
+        classMap,
+        roleMap,
+        namespaces,
+        structTreeAF,
+        structTreePronunciationLexicon,
+      } = documentData;
+
+      if (!structTreeRoot) {
+        continue;
+      }
+
+      this.currentDocument = documentData;
+      // Get all the removed StructElem
+      const removedStructElements = new RefSet();
+      for (const [key, value] of parentTree || []) {
+        if (!usedStructParents.has(key) && value instanceof Ref) {
+          removedStructElements.put(value);
+        }
+      }
+
+      // Deduplicate IDs in the ID tree.
+      // We keep the old node references since they will be cloned later when
+      // cloning the struct tree.
+      const dedupIDs = new Map();
+      for (const [id, nodeRef] of idTree || []) {
+        let _id = id;
+        if (newIdTree.has(id)) {
+          for (let i = 1; ; i++) {
+            const newId = `${id}_${i}`;
+            if (!newIdTree.has(newId)) {
+              dedupIDs.set(id, newId);
+              _id = newId;
+              break;
+            }
+          }
+        }
+        newIdTree.set(_id, nodeRef);
+      }
+
+      const dedupClasses = new Map();
+      if (classMap?.size > 0) {
+        // Deduplicate ClassMap entries.
+        for (let [className, classDict] of classMap) {
+          classDict = await this.#collectDependencies(classDict, true, xref);
+          if (newClassMap.has(className)) {
+            for (let i = 1; ; i++) {
+              const newClassName = `${className}_${i}`;
+              if (!newClassMap.has(newClassName)) {
+                dedupClasses.set(className, newClassName);
+                className = newClassName;
+                break;
+              }
+            }
+          }
+          newClassMap.set(className, classDict);
+        }
+      }
+
+      const dedupRoles = new Map();
+      if (roleMap?.size > 0) {
+        // Deduplicate RoleMap entries.
+        for (const [roleName, mappedName] of roleMap) {
+          const newMappedName = newRoleMap.get(roleName);
+          if (!newMappedName) {
+            newRoleMap.set(roleName, mappedName);
+            continue;
+          }
+          if (newMappedName === mappedName) {
+            continue;
+          }
+          for (let i = 1; ; i++) {
+            const newRoleName = `${roleName}_${i}`;
+            if (!newRoleMap.has(newRoleName)) {
+              dedupRoles.set(roleName, newRoleName);
+              newRoleMap.set(newRoleName, mappedName);
+              break;
+            }
+          }
+        }
+      }
+
+      if (namespaces?.length > 0) {
+        for (const namespaceRef of namespaces) {
+          const namespace = await xref.fetchIfRefAsync(namespaceRef);
+          let ns = namespace.get("NS");
+          if (!ns || newNamespaces.has(ns)) {
+            continue;
+          }
+          ns = stringToPDFString(ns, /* keepEscapeSequence = */ false);
+          const newNamespace = await this.#collectDependencies(
+            namespace,
+            true,
+            xref
+          );
+          newNamespaces.set(ns, newNamespace);
+        }
+      }
+
+      if (structTreeAF) {
+        for (const afRef of structTreeAF) {
+          newStructTreeAF.push(
+            await this.#collectDependencies(afRef, true, xref)
+          );
+        }
+      }
+
+      if (structTreePronunciationLexicon) {
+        for (const lexiconRef of structTreePronunciationLexicon) {
+          newStructTreePronunciationLexicon.push(
+            await this.#collectDependencies(lexiconRef, true, xref)
+          );
+        }
+      }
+
+      // Get the kids.
+      let kids = structTreeRoot.dict.get("K");
+      if (!kids) {
+        continue;
+      }
+      kids = Array.isArray(kids) ? kids : [kids];
+      for (let kid of kids) {
+        const kidRef = kid instanceof Ref ? kid : null;
+        if (kidRef && removedStructElements.has(kidRef)) {
+          continue;
+        }
+        kid = await xref.fetchIfRefAsync(kid);
+        const newKidRef = await this.#cloneStructTreeNode(
+          kidRef,
+          kid,
+          xref,
+          removedStructElements,
+          dedupIDs,
+          dedupClasses,
+          dedupRoles
+        );
+        if (newKidRef) {
+          structTreeKids.push(newKidRef);
+        }
+      }
+
+      // Fix the ID tree.
+      for (const [id, nodeRef] of idTree || []) {
+        const newNodeRef = oldRefMapping.get(nodeRef);
+        const newId = dedupIDs.get(id) || id;
+        if (newNodeRef) {
+          newIdTree.set(newId, newNodeRef);
+        } else {
+          newIdTree.delete(newId);
+        }
+      }
+    }
+
+    for (const [key, [oldRefMapping, parent]] of newParentTree) {
+      if (!parent) {
+        newParentTree.delete(key);
+        continue;
+      }
+      // Some nodes haven't been visited while cloning the struct trees so their
+      // ref don't belong to the oldRefMapping. Remove those nodes.
+      if (!Array.isArray(parent)) {
+        const newParent = oldRefMapping.get(parent);
+        if (newParent === undefined) {
+          newParentTree.delete(key);
+        } else {
+          newParentTree.set(key, newParent);
+        }
+        continue;
+      }
+      const newParents = parent.map(
+        ref => (ref instanceof Ref && oldRefMapping.get(ref)) || null
+      );
+      if (newParents.length === 0 || newParents.every(ref => ref === null)) {
+        newParentTree.delete(key);
+        continue;
+      }
+      newParentTree.set(key, newParents);
+    }
+
+    this.currentDocument = null;
+  }
+
   /**
    * Collect named destinations that are still valid (i.e. pointing to kept
    * pages).
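When two source documents use the same element ID, class name or role name, the merge above keeps the first occurrence and renames later ones by appending a numeric suffix until the name is free in the merged map. A minimal sketch of that scheme, for illustration only (not part of the diff):

// Return `name` unchanged if it is free in `usedNames` (a Set or Map of
// already merged names), otherwise try name_1, name_2, ... until one is free.
function dedupName(name, usedNames) {
  if (!usedNames.has(name)) {
    return name;
  }
  for (let i = 1; ; i++) {
    const candidate = `${name}_${i}`;
    if (!usedNames.has(candidate)) {
      return candidate;
    }
  }
}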
@@ -566,7 +1131,7 @@ class PDFEditor {
       }
       if (stFirstIndex !== -1) {
         const st = currentLabel.get("St");
-        currentLabel = currentLabel.clone();
+        currentLabel = this.cloneDict(currentLabel);
         currentLabel.set("St", st + (i - stFirstIndex));
         stFirstIndex = -1;
       }
@@ -598,7 +1163,7 @@ class PDFEditor {
     const { dedupNamedDestinations, oldRefMapping } = documentData;
     const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
     const pageRef = this.newRef;
-    const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
+    const pageDict = (this.xref[pageRef.num] = this.cloneDict(page.pageDict));
     oldRefMapping.put(oldPageRef, pageRef);

     if (pointingNamedDestinations) {
@@ -796,6 +1361,71 @@ class PDFEditor {
     );
   }

+  #makeStructTree() {
+    const { structTreeKids } = this;
+    if (!structTreeKids || structTreeKids.length === 0) {
+      return;
+    }
+    const { rootDict } = this;
+    const structTreeRef = this.newRef;
+    const structTree = (this.xref[structTreeRef.num] = new Dict());
+    structTree.setIfName("Type", "StructTreeRoot");
+    structTree.setIfArray("K", structTreeKids);
+    for (const kidRef of structTreeKids) {
+      const kid = this.xref[kidRef.num];
+      const type = kid.get("Type");
+      if (!type || isName(type, "StructElem")) {
+        kid.set("P", structTreeRef);
+      }
+    }
+    if (this.parentTree.size > 0) {
+      const parentTreeRef = this.#makeNameNumTree(
+        Array.from(this.parentTree.entries()),
+        /* areNames = */ false
+      );
+      const parentTree = this.xref[parentTreeRef.num];
+      parentTree.setIfName("Type", "ParentTree");
+      structTree.set("ParentTree", parentTreeRef);
+      structTree.set("ParentTreeNextKey", this.parentTree.size);
+    }
+    if (this.idTree.size > 0) {
+      const idTreeRef = this.#makeNameNumTree(
+        Array.from(this.idTree.entries()),
+        /* areNames = */ true
+      );
+      const idTree = this.xref[idTreeRef.num];
+      idTree.setIfName("Type", "IDTree");
+      structTree.set("IDTree", idTreeRef);
+    }
+    if (this.classMap.size > 0) {
+      const classMapRef = this.newRef;
+      this.xref[classMapRef.num] = this.classMap;
+      structTree.set("ClassMap", classMapRef);
+    }
+    if (this.roleMap.size > 0) {
+      const roleMapRef = this.newRef;
+      this.xref[roleMapRef.num] = this.roleMap;
+      structTree.set("RoleMap", roleMapRef);
+    }
+    if (this.namespaces.size > 0) {
+      const namespacesRef = this.newRef;
+      this.xref[namespacesRef.num] = Array.from(this.namespaces.values());
+      structTree.set("Namespaces", namespacesRef);
+    }
+    if (this.structTreeAF.length > 0) {
+      const structTreeAFRef = this.newRef;
+      this.xref[structTreeAFRef.num] = this.structTreeAF;
+      structTree.set("AF", structTreeAFRef);
+    }
+    if (this.structTreePronunciationLexicon.length > 0) {
+      const structTreePronunciationLexiconRef = this.newRef;
+      this.xref[structTreePronunciationLexiconRef.num] =
+        this.structTreePronunciationLexicon;
+      structTree.set("PronunciationLexicon", structTreePronunciationLexiconRef);
+    }
+    rootDict.set("StructTreeRoot", structTreeRef);
+  }
+
   /**
    * Create the root dictionary.
    * @returns {Promise<void>}
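For orientation, #makeStructTree only emits the optional entries that ended up non-empty after the merge. A rough description of the resulting StructTreeRoot dictionary, written as a plain object for illustration only (not part of the diff):

// Illustrative shape of the merged StructTreeRoot; each entry is only set
// when the corresponding merged data exists in at least one source document.
const mergedStructTreeRootShape = {
  Type: "StructTreeRoot",
  K: ["<cloned top-level StructElem references>"],
  ParentTree: "<number tree built from the merged parent tree entries>",
  ParentTreeNextKey: "<number of merged parent tree entries>",
  IDTree: "<name tree built from the merged, deduplicated element IDs>",
  ClassMap: "<merged class map>",
  RoleMap: "<merged role map>",
  Namespaces: "<array of merged namespace dictionaries>",
  AF: "<merged associated files>",
  PronunciationLexicon: "<merged pronunciation lexicons>",
};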
@@ -807,6 +1437,7 @@ class PDFEditor {
     this.#makePageTree();
     this.#makePageLabelsTree();
     this.#makeDestinationsTree();
+    this.#makeStructTree();
   }

   /**
@@ -34,7 +34,7 @@ class NameOrNumberTree {
     this._type = type;
   }

-  getAll() {
+  getAll(isRaw = false) {
     const map = new Map();
     if (!this.root) {
       return map;
@@ -68,7 +68,10 @@ class NameOrNumberTree {
         continue;
       }
       for (let i = 0, ii = entries.length; i < ii; i += 2) {
-        map.set(xref.fetchIfRef(entries[i]), xref.fetchIfRef(entries[i + 1]));
+        map.set(
+          xref.fetchIfRef(entries[i]),
+          isRaw ? entries[i + 1] : xref.fetchIfRef(entries[i + 1])
+        );
       }
     }
     return map;
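The new isRaw flag lets callers keep the tree values as raw objects (typically Refs) instead of resolving them through the xref, which is what #collectDocumentData relies on when reading the ParentTree and IDTree so the values can be remapped later. An illustrative usage sketch; the parentTree value and xref instance are assumed to come from the surrounding code:

import { NumberTree } from "../name_number_tree.js";

const numberTree = new NumberTree(parentTree, xref);
const resolved = numberTree.getAll();               // values fetched through the xref
const raw = numberTree.getAll(/* isRaw = */ true);  // values kept as raw Refs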
test/pdfs/.gitignore (vendored, 2 additions)
@@ -759,3 +759,5 @@
 !doc_3_3_pages.pdf
 !labelled_pages.pdf
 !extract_link.pdf
+!two_paragraphs.pdf
+!paragraph_and_link.pdf
test/pdfs/paragraph_and_link.pdf (BIN, new executable file; binary file not shown)
test/pdfs/two_paragraphs.pdf (BIN, new executable file; binary file not shown)
@@ -5722,5 +5722,251 @@ small scripts as well as for`);
         await loadingTask.destroy();
       });
     });
+
+    describe("Struct trees", function () {
+      it("extract pages and merge struct trees", async function () {
+        let loadingTask = getDocument(
+          buildGetDocumentParams("two_paragraphs.pdf")
+        );
+        let pdfDoc = await loadingTask.promise;
+        let pdfPage = await pdfDoc.getPage(1);
+        const structTree = await pdfPage.getStructTree();
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+        const filterItems = item => {
+          if (item.type === "beginMarkedContentProps") {
+            return item.id;
+          }
+          if (item.str !== undefined) {
+            return item.str;
+          }
+          return null;
+        };
+        let { items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        });
+        expect(items.map(filterItems)).toEqual([
+          "p19R_mc0",
+          "The first paragraph.",
+          null,
+          "p19R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        const data = await pdfDoc.extractPages([
+          { document: null },
+          { document: null },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        expect(pdfDoc.numPages).toEqual(2);
+        pdfPage = await pdfDoc.getPage(1);
+        const structTree1 = await pdfPage.getStructTree();
+        expect(structTree1).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        ({ items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        }));
+        expect(items.map(filterItems)).toEqual([
+          "p4R_mc0",
+          "The first paragraph.",
+          null,
+          "p4R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        pdfPage = await pdfDoc.getPage(2);
+        const structTree2 = await pdfPage.getStructTree();
+        expect(structTree2).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p19R_mc1" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        ({ items } = await pdfPage.getTextContent({
+          includeMarkedContent: true,
+          disableNormalization: true,
+        }));
+        expect(items.map(filterItems)).toEqual([
+          "p19R_mc0",
+          "The first paragraph.",
+          null,
+          "p19R_mc1",
+          "",
+          "The second paragraph.",
+          null,
+        ]);
+
+        await loadingTask.destroy();
+      });
+
+      it("extract pages with a removed link", async function () {
+        let loadingTask = getDocument(
+          buildGetDocumentParams("paragraph_and_link.pdf")
+        );
+        let pdfDoc = await loadingTask.promise;
+
+        const data = await pdfDoc.extractPages([
+          { document: null, excludePages: [1] },
+          { document: null },
+        ]);
+        await loadingTask.destroy();
+
+        loadingTask = getDocument(data);
+        pdfDoc = await loadingTask.promise;
+
+        expect(pdfDoc.numPages).toEqual(3);
+        let pdfPage = await pdfDoc.getPage(1);
+        let structTree = await pdfPage.getStructTree();
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc3" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p4R_mc6" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+
+        pdfPage = await pdfDoc.getPage(2);
+        structTree = await pdfPage.getStructTree();
+
+        expect(structTree).toEqual({
+          children: [
+            {
+              role: "Document",
+              children: [
+                {
+                  role: "Sect",
+                  children: [
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p23R_mc0" }],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [
+                        {
+                          role: "Reference",
+                          children: [{ type: "content", id: "p23R_mc2" }],
+                          lang: "EN-US",
+                        },
+                        { type: "content", id: "p23R_mc3" },
+                      ],
+                      lang: "EN-US",
+                    },
+                    {
+                      role: "P",
+                      children: [{ type: "content", id: "p23R_mc6" }],
+                      lang: "EN-US",
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+          role: "Root",
+        });
+        await loadingTask.destroy();
+      });
+    });
   });
 });