Add the possibility to create a PDF from different ones (bug 1997379)

For now it is only possible to create a single PDF by selecting some pages from different PDF sources.
The merge is still pretty basic (which is why this is still a WIP); none of the following data are merged yet:
 - the struct trees
 - the page labels
 - the outlines
 - the named destinations
Hence there are two new ref tests where new PDFs are created: one with some extracted pages and another
one (encrypted) which is just rewritten.
The reference images are generated from the original PDFs by selecting the pages we want (the driver's new
master mode), and the new images are taken from the generated PDFs.
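
To illustrate, here is a minimal sketch of the new API as seen from the main thread (the document names
are placeholders; "document: null" means "the document the method is called on", and ranges are inclusive):

const data = await pdfDoc.extractPages([
  // Pages 0, 1, 2 and 5 of this document.
  { document: null, includePages: [[0, 2], 5] },
  // Everything except the first page of another PDF, passed in as a
  // Uint8Array (with its password when it is encrypted).
  { document: otherPdfBytes, excludePages: [0], password: "secret" },
]);
// data is a Uint8Array containing the newly built PDF.
const mergedDoc = await getDocument(data).promise;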
Calixte Denizet 2025-10-30 18:25:18 +01:00
parent 0a2680bca6
commit bc87f4e8d6
19 changed files with 1089 additions and 44 deletions

View File

@@ -131,6 +131,19 @@ class DecodeStream extends BaseStream {
getBaseStreams() {
return this.stream ? this.stream.getBaseStreams() : null;
}
clone() {
// Make sure it has been fully read.
while (!this.eof) {
this.readBlock();
}
return new Stream(
this.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StreamsSequenceStream extends DecodeStream {

View File

@@ -52,6 +52,10 @@ class DecryptStream extends DecodeStream {
buffer.set(chunk, bufferLength);
this.bufferLength = newLength;
}
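// Return the stream itself (whose data is the decrypted content): when an
// object is copied or rewritten, the decrypted bytes must be used, since any
// encryption for the target document is applied afresh by the writer.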
getOriginalStream() {
return this;
}
}
export { DecryptStream };

View File

@@ -178,7 +178,7 @@ class Page {
);
}
#getBoundingBox(name) {
getBoundingBox(name) {
if (this.xfaData) {
return this.xfaData.bbox;
}
@@ -201,7 +201,7 @@
return shadow(
this,
"mediaBox",
this.#getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
this.getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
);
}
@@ -210,7 +210,7 @@
return shadow(
this,
"cropBox",
this.#getBoundingBox("CropBox") || this.mediaBox
this.getBoundingBox("CropBox") || this.mediaBox
);
}

View File

@@ -0,0 +1,594 @@
/* Copyright 2025 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** @typedef {import("../document.js").PDFDocument} PDFDocument */
/** @typedef {import("../document.js").Page} Page */
/** @typedef {import("../xref.js").XRef} XRef */
import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
import { getModificationDate, stringToPDFString } from "../../shared/util.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { BaseStream } from "../base_stream.js";
import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";
const MAX_LEAVES_PER_PAGES_NODE = 16;
class PageData {
constructor(page, documentData) {
this.page = page;
this.documentData = documentData;
this.annotations = null;
documentData.pagesMap.put(page.ref, this);
}
}
class DocumentData {
constructor(document) {
this.document = document;
this.pagesMap = new RefSetCache();
this.oldRefMapping = new RefSetCache();
}
}
class PDFEditor {
constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
this.hasSingleFile = false;
this.currentDocument = null;
this.oldPages = [];
this.newPages = [];
this.xref = [null];
this.newRefCount = 1;
[this.rootRef, this.rootDict] = this.newDict;
[this.infoRef, this.infoDict] = this.newDict;
[this.pagesRef, this.pagesDict] = this.newDict;
this.namesDict = null;
this.useObjectStreams = useObjectStreams;
this.objStreamRefs = useObjectStreams ? new Set() : null;
this.version = "1.7";
this.title = title;
this.author = author;
}
/**
* Get a new reference for an object in the PDF.
* @returns {Ref}
*/
get newRef() {
const ref = Ref.get(this.newRefCount++, 0);
return ref;
}
/**
* Create a new dictionary and its reference.
* @returns {[Ref, Dict]}
*/
get newDict() {
const ref = this.newRef;
const dict = (this.xref[ref.num] = new Dict());
return [ref, dict];
}
/**
* Clone an object in the PDF.
* @param {*} obj
* @param {XRef} xref
* @returns {Promise<Ref>}
*/
async #cloneObject(obj, xref) {
const ref = this.newRef;
this.xref[ref.num] = await this.#collectDependencies(obj, true, xref);
return ref;
}
/**
* Collect the dependencies of an object and create new references for each
* dependency.
* @param {*} obj
* @param {boolean} mustClone
* @param {XRef} xref
* @returns {Promise<*>}
*/
async #collectDependencies(obj, mustClone, xref) {
if (obj instanceof Ref) {
const {
currentDocument: { oldRefMapping },
} = this;
let newRef = oldRefMapping.get(obj);
if (newRef) {
return newRef;
}
newRef = this.newRef;
oldRefMapping.put(obj, newRef);
const oldRef = obj;
obj = await xref.fetchAsync(oldRef);
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (
obj instanceof Dict &&
isName(obj.get("Type"), "Page") &&
!this.currentDocument.pagesMap.has(oldRef)
) {
throw new Error(
"Add a deleted page to the document is not supported."
);
}
}
this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
return newRef;
}
const promises = [];
if (Array.isArray(obj)) {
if (mustClone) {
obj = obj.slice();
}
for (let i = 0, ii = obj.length; i < ii; i++) {
promises.push(
this.#collectDependencies(obj[i], true, xref).then(
newObj => (obj[i] = newObj)
)
);
}
await Promise.all(promises);
return obj;
}
let dict;
if (obj instanceof BaseStream) {
({ dict } = obj = obj.getOriginalStream().clone());
} else if (obj instanceof Dict) {
if (mustClone) {
obj = obj.clone();
}
dict = obj;
}
if (dict) {
for (const [key, rawObj] of dict.getRawEntries()) {
promises.push(
this.#collectDependencies(rawObj, true, xref).then(newObj =>
dict.set(key, newObj)
)
);
}
await Promise.all(promises);
}
return obj;
}
/**
* @typedef {Object} PageInfo
* @property {PDFDocument} document
* @property {Array<Array<number>|number>} [includePages]
* included ranges (inclusive) or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges (inclusive) or indices.
*/
/**
* Extract pages from the given documents.
* @param {Array<PageInfo>} pageInfos
* @return {Promise<Uint8Array>}
*/
async extractPages(pageInfos) {
const promises = [];
let newIndex = 0;
this.hasSingleFile = pageInfos.length === 1;
for (const { document, includePages, excludePages } of pageInfos) {
if (!document) {
continue;
}
const documentData = new DocumentData(document);
promises.push(this.#collectDocumentData(documentData));
let keptIndices, keptRanges, deletedIndices, deletedRanges;
for (const page of includePages || []) {
if (Array.isArray(page)) {
(keptRanges ||= []).push(page);
} else {
(keptIndices ||= new Set()).add(page);
}
}
for (const page of excludePages || []) {
if (Array.isArray(page)) {
(deletedRanges ||= []).push(page);
} else {
(deletedIndices ||= new Set()).add(page);
}
}
for (let i = 0, ii = document.numPages; i < ii; i++) {
if (deletedIndices?.has(i)) {
continue;
}
if (deletedRanges) {
let isDeleted = false;
for (const [start, end] of deletedRanges) {
if (i >= start && i <= end) {
isDeleted = true;
break;
}
}
if (isDeleted) {
continue;
}
}
let takePage = false;
if (keptIndices) {
takePage = keptIndices.has(i);
}
if (!takePage && keptRanges) {
for (const [start, end] of keptRanges) {
if (i >= start && i <= end) {
takePage = true;
break;
}
}
}
if (!takePage && !keptIndices && !keptRanges) {
takePage = true;
}
if (!takePage) {
continue;
}
const newPageIndex = newIndex++;
promises.push(
document.getPage(i).then(page => {
this.oldPages[newPageIndex] = new PageData(page, documentData);
})
);
}
}
await Promise.all(promises);
promises.length = 0;
for (const page of this.oldPages) {
promises.push(this.#postCollectPageData(page));
}
await Promise.all(promises);
for (let i = 0, ii = this.oldPages.length; i < ii; i++) {
this.newPages[i] = await this.#makePageCopy(i);
}
return this.writePDF();
}
/**
* Collect the document data.
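* Currently a stub (WIP): struct trees, page labels, outlines and named
* destinations are not collected, hence not merged.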
* @param {DocumentData} documentData
* @return {Promise<void>}
*/
async #collectDocumentData(documentData) {}
/**
* Post process the collected page data.
* @param {PageData} pageData
* @returns {Promise<void>}
*/
async #postCollectPageData(pageData) {
const {
page: { xref, annotations },
} = pageData;
if (!annotations) {
return;
}
const promises = [];
let newAnnotations = [];
let newIndex = 0;
// TODO: remove only links to deleted pages.
for (const annotationRef of annotations) {
const newAnnotationIndex = newIndex++;
promises.push(
xref.fetchIfRefAsync(annotationRef).then(async annotationDict => {
if (!isName(annotationDict.get("Subtype"), "Link")) {
newAnnotations[newAnnotationIndex] = annotationRef;
}
})
);
}
await Promise.all(promises);
newAnnotations = newAnnotations.filter(annot => !!annot);
pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
}
/**
* Create a copy of a page.
* @param {number} pageIndex
* @returns {Promise<Ref>} the page reference in the new PDF document.
*/
async #makePageCopy(pageIndex) {
const { page, documentData, annotations } = this.oldPages[pageIndex];
this.currentDocument = documentData;
const { oldRefMapping } = documentData;
const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
const pageRef = this.newRef;
const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
oldRefMapping.put(oldPageRef, pageRef);
// No need to keep these entries as we'll set them again later.
for (const key of [
"Rotate",
"MediaBox",
"CropBox",
"BleedBox",
"TrimBox",
"ArtBox",
"Resources",
"Annots",
"Parent",
"UserUnit",
]) {
pageDict.delete(key);
}
const lastRef = this.newRefCount;
await this.#collectDependencies(pageDict, false, xref);
pageDict.set("Rotate", rotate);
pageDict.set("MediaBox", mediaBox);
for (const boxName of ["CropBox", "BleedBox", "TrimBox", "ArtBox"]) {
const box = page.getBoundingBox(boxName);
if (box?.some((value, index) => value !== mediaBox[index])) {
// These boxes are optional and their default value is the MediaBox.
pageDict.set(boxName, box);
}
}
const userUnit = page.userUnit;
if (userUnit !== 1) {
pageDict.set("UserUnit", userUnit);
}
pageDict.setIfDict(
"Resources",
await this.#collectDependencies(resources, true, xref)
);
pageDict.setIfArray(
"Annots",
await this.#collectDependencies(annotations, true, xref)
);
if (this.useObjectStreams) {
const newLastRef = this.newRefCount;
const pageObjectRefs = [];
for (let i = lastRef; i < newLastRef; i++) {
const obj = this.xref[i];
if (obj instanceof BaseStream) {
continue;
}
pageObjectRefs.push(Ref.get(i, 0));
}
for (let i = 0; i < pageObjectRefs.length; i += 0xffff) {
const objStreamRef = this.newRef;
this.objStreamRefs.add(objStreamRef.num);
this.xref[objStreamRef.num] = pageObjectRefs.slice(i, i + 0xffff);
}
}
this.currentDocument = null;
return pageRef;
}
/**
* Create the page tree structure.
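* With the default MAX_LEAVES_PER_PAGES_NODE of 16, e.g. 100 pages are split
* into chunks of 16, so the root gets 7 intermediate /Pages nodes, each
* holding at most 16 /Page leaves.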
*/
#makePageTree() {
const { newPages: pages, rootDict, pagesRef, pagesDict } = this;
rootDict.set("Pages", pagesRef);
pagesDict.setIfName("Type", "Pages");
pagesDict.set("Count", pages.length);
const maxLeaves =
MAX_LEAVES_PER_PAGES_NODE <= 1 ? pages.length : MAX_LEAVES_PER_PAGES_NODE;
const stack = [{ dict: pagesDict, kids: pages, parentRef: pagesRef }];
while (stack.length > 0) {
const { dict, kids, parentRef } = stack.pop();
if (kids.length <= maxLeaves) {
dict.set("Kids", kids);
for (const ref of kids) {
this.xref[ref.num].set("Parent", parentRef);
}
continue;
}
const chunkSize = Math.max(maxLeaves, Math.ceil(kids.length / maxLeaves));
const kidsChunks = [];
for (let i = 0; i < kids.length; i += chunkSize) {
kidsChunks.push(kids.slice(i, i + chunkSize));
}
const kidsRefs = [];
dict.set("Kids", kidsRefs);
for (const chunk of kidsChunks) {
const [kidRef, kidDict] = this.newDict;
kidsRefs.push(kidRef);
kidDict.setIfName("Type", "Pages");
kidDict.set("Parent", parentRef);
kidDict.set("Count", chunk.length);
stack.push({ dict: kidDict, kids: chunk, parentRef: kidRef });
}
}
}
/**
* Create the root dictionary.
* @returns {Promise<void>}
*/
async #makeRoot() {
const { rootDict } = this;
rootDict.setIfName("Type", "Catalog");
rootDict.set("Version", this.version);
this.#makePageTree();
}
/**
* Create the info dictionary.
* @returns {Map} infoMap
*/
#makeInfo() {
const infoMap = new Map();
if (this.hasSingleFile) {
const {
xref: { trailer },
} = this.oldPages[0].documentData.document;
const oldInfoDict = trailer.get("Info");
for (const [key, value] of oldInfoDict || []) {
if (typeof value === "string") {
infoMap.set(key, stringToPDFString(value));
}
}
}
infoMap.delete("ModDate");
infoMap.set("CreationDate", getModificationDate());
infoMap.set("Creator", "PDF.js");
infoMap.set("Producer", "Firefox");
if (this.author) {
infoMap.set("Author", this.author);
}
if (this.title) {
infoMap.set("Title", this.title);
}
for (const [key, value] of infoMap) {
this.infoDict.set(key, stringToAsciiOrUTF16BE(value));
}
return infoMap;
}
/**
* Create the encryption dictionary if required.
* @returns {Promise<[Dict|null, CipherTransformFactory|null, Array|null]>}
*/
async #makeEncrypt() {
if (!this.hasSingleFile) {
return [null, null, null];
}
const { documentData } = this.oldPages[0];
const {
document: {
xref: { trailer, encrypt },
},
} = documentData;
if (!trailer.has("Encrypt")) {
return [null, null, null];
}
const encryptDict = trailer.get("Encrypt");
if (!(encryptDict instanceof Dict)) {
return [null, null, null];
}
this.currentDocument = documentData;
const result = [
await this.#cloneObject(encryptDict, trailer.xref),
encrypt,
trailer.get("ID"),
];
this.currentDocument = null;
return result;
}
/**
* Create the changes required to write the new PDF document.
* @returns {Promise<[RefSetCache, Ref]>}
*/
async #createChanges() {
const changes = new RefSetCache();
changes.put(Ref.get(0, 0xffff), { data: null });
for (let i = 1, ii = this.xref.length; i < ii; i++) {
if (this.objStreamRefs?.has(i)) {
await this.#createObjectStream(Ref.get(i, 0), this.xref[i], changes);
} else {
changes.put(Ref.get(i, 0), { data: this.xref[i] });
}
}
return [changes, this.newRef];
}
/**
* Create an object stream containing the given objects.
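* The stream data is a header of newline-separated "num offset" pairs
* followed by each serialized object on its own line, e.g. for three objects
* (offsets illustrative):
*   20 0
*   21 25
*   22 58
*   <<...object 20...>>
*   ...
* /First points just past the header (see ISO 32000-1, 7.5.7).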
* @param {Ref} objStreamRef
* @param {Array<Ref>} objRefs
* @param {RefSetCache} changes
*/
async #createObjectStream(objStreamRef, objRefs, changes) {
const streamBuffer = [""];
const objOffsets = [];
let offset = 0;
const buffer = [];
for (let i = 0, ii = objRefs.length; i < ii; i++) {
const objRef = objRefs[i];
changes.put(objRef, { data: null, objStreamRef, index: i });
objOffsets.push(`${objRef.num} ${offset}`);
const data = this.xref[objRef.num];
await writeValue(data, buffer, /* transform = */ null);
const obj = buffer.join("");
buffer.length = 0;
streamBuffer.push(obj);
offset += obj.length + 1;
}
streamBuffer[0] = objOffsets.join("\n");
const objStream = new StringStream(streamBuffer.join("\n"));
const objStreamDict = (objStream.dict = new Dict());
objStreamDict.setIfName("Type", "ObjStm");
objStreamDict.set("N", objRefs.length);
objStreamDict.set("First", streamBuffer[0].length + 1);
changes.put(objStreamRef, { data: objStream });
}
/**
* Write the new PDF document to a Uint8Array.
* @returns {Promise<Uint8Array>}
*/
async writePDF() {
await this.#makeRoot();
const infoMap = this.#makeInfo();
const [encryptRef, encrypt, fileIds] = await this.#makeEncrypt();
const [changes, xrefTableRef] = await this.#createChanges();
// Create the PDF header in order to help sniffers.
// PDF version must be in the range 1.0 to 1.7 inclusive.
// We add a binary comment line to ensure that the file is treated
// as a binary file by applications that open it.
const header = [
...`%PDF-${this.version}\n%`.split("").map(c => c.charCodeAt(0)),
0xfa,
0xde,
0xfa,
0xce,
];
return incrementalUpdate({
originalData: new Uint8Array(header),
changes,
xrefInfo: {
startXRef: null,
rootRef: this.rootRef,
infoRef: this.infoRef,
encryptRef,
newRef: xrefTableRef,
fileIds: fileIds || [null, null],
infoMap,
},
useXrefStream: this.useObjectStreams,
xref: {
encrypt,
encryptRef,
},
});
}
}
export { PDFEditor };
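
For reference, the worker-side flow (see the worker.js hunk below) boils down to a sketch like this,
where each document is an already-parsed, worker-side PDFDocument:

const editor = new PDFEditor({ title: "Merged document" });
const bytes = await editor.extractPages([
  { document: docA }, // all pages
  { document: docB, includePages: [[0, 1]] }, // the first two pages
]);
// bytes is a Uint8Array holding the complete new PDF.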

View File

@@ -188,6 +188,10 @@ class Dict {
return [...this._map.values()];
}
getRawEntries() {
return this._map.entries();
}
set(key, value) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (typeof key !== "string") {
@@ -231,6 +235,12 @@ class Dict {
}
}
setIfDict(key, value) {
if (value instanceof Dict) {
this.set(key, value);
}
}
has(key) {
return this._map.has(key);
}

View File

@@ -82,6 +82,15 @@ class Stream extends BaseStream {
makeSubStream(start, length, dict = null) {
return new Stream(this.bytes.buffer, start, length, dict);
}
clone() {
return new Stream(
this.bytes.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StringStream extends Stream {

View File

@@ -36,6 +36,7 @@ import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import { AnnotationFactory } from "./annotation.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { PDFEditor } from "./editor/pdf_editor.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { StructTreeRoot } from "./struct_tree.js";
@@ -557,6 +558,97 @@ class WorkerMessageHandler {
return pdfManager.ensureDoc("calculationOrderIds");
});
handler.on("ExtractPages", async function ({ pageInfos }) {
if (!pageInfos) {
warn("extractPages: nothing to extract.");
return null;
}
if (!Array.isArray(pageInfos)) {
pageInfos = [pageInfos];
}
let newDocumentId = 0;
for (const pageInfo of pageInfos) {
if (pageInfo.document === null) {
pageInfo.document = pdfManager.pdfDocument;
} else if (ArrayBuffer.isView(pageInfo.document)) {
const manager = new LocalPdfManager({
source: pageInfo.document,
docId: `${docId}_extractPages_${newDocumentId++}`,
handler,
password: pageInfo.password ?? null,
evaluatorOptions: Object.assign({}, pdfManager.evaluatorOptions),
});
let recoveryMode = false;
let isValid = true;
while (true) {
try {
await manager.requestLoadedStream();
await manager.ensureDoc("checkHeader");
await manager.ensureDoc("parseStartXRef");
await manager.ensureDoc("parse", [recoveryMode]);
break;
} catch (e) {
if (e instanceof XRefParseException) {
if (recoveryMode === false) {
recoveryMode = true;
continue;
} else {
isValid = false;
warn("extractPages: XRefParseException.");
}
} else if (e instanceof PasswordException) {
const task = new WorkerTask(
`PasswordException: response ${e.code}`
);
startWorkerTask(task);
try {
const { password } = await handler.sendWithPromise(
"PasswordRequest",
e
);
manager.updatePassword(password);
} catch {
isValid = false;
warn("extractPages: invalid password.");
} finally {
finishWorkerTask(task);
}
} else {
isValid = false;
warn("extractPages: invalid document.");
}
if (!isValid) {
break;
}
}
}
if (!isValid) {
pageInfo.document = null;
continue;
}
const isPureXfa = await manager.ensureDoc("isPureXfa");
if (isPureXfa) {
pageInfo.document = null;
warn("extractPages does not support pure XFA documents.");
} else {
pageInfo.document = manager.pdfDocument;
}
} else {
warn("extractPages: invalid document.");
}
}
try {
const pdfEditor = new PDFEditor();
const buffer = await pdfEditor.extractPages(pageInfos);
return buffer;
} catch (reason) {
// eslint-disable-next-line no-console
console.error(reason);
return null;
}
});
handler.on(
"SaveDocument",
async function ({ isPureXfa, numPages, annotationStorage, filename }) {

View File

@@ -19,7 +19,6 @@ import {
escapePDFName,
escapeString,
getSizeInBytes,
numberToString,
parseXFAPath,
} from "./core_utils.js";
import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
@@ -27,29 +26,34 @@ import { Stream, StringStream } from "./stream.js";
import { BaseStream } from "./base_stream.js";
import { calculateMD5 } from "./calculate_md5.js";
async function writeObject(ref, obj, buffer, { encrypt = null }) {
const transform = encrypt?.createCipherTransform(ref.num, ref.gen);
async function writeObject(
ref,
obj,
buffer,
{ encrypt = null, encryptRef = null }
) {
// Avoid encrypting the encrypt dictionary.
const transform =
encrypt && encryptRef !== ref
? encrypt.createCipherTransform(ref.num, ref.gen)
: null;
buffer.push(`${ref.num} ${ref.gen} obj\n`);
if (obj instanceof Dict) {
await writeDict(obj, buffer, transform);
} else if (obj instanceof BaseStream) {
await writeStream(obj, buffer, transform);
} else if (Array.isArray(obj) || ArrayBuffer.isView(obj)) {
await writeArray(obj, buffer, transform);
}
await writeValue(obj, buffer, transform);
buffer.push("\nendobj\n");
}
async function writeDict(dict, buffer, transform) {
buffer.push("<<");
for (const key of dict.getKeys()) {
for (const [key, rawObj] of dict.getRawEntries()) {
buffer.push(` /${escapePDFName(key)} `);
await writeValue(dict.getRaw(key), buffer, transform);
await writeValue(rawObj, buffer, transform);
}
buffer.push(">>");
}
async function writeStream(stream, buffer, transform) {
stream = stream.getOriginalStream();
stream.reset();
let bytes = stream.getBytes();
const { dict } = stream;
@@ -67,7 +71,7 @@ async function writeStream(stream, buffer, transform) {
// The number 256 is arbitrary, but it should be reasonable.
const MIN_LENGTH_FOR_COMPRESSING = 256;
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING || isFilterZeroFlateDecode) {
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING && !isFilterZeroFlateDecode) {
try {
const cs = new CompressionStream("deflate");
const writer = cs.writable.getWriter();
@@ -120,14 +124,11 @@ async function writeStream(stream, buffer, transform) {
async function writeArray(array, buffer, transform) {
buffer.push("[");
let first = true;
for (const val of array) {
if (!first) {
for (let i = 0, ii = array.length; i < ii; i++) {
await writeValue(array[i], buffer, transform);
if (i < ii - 1) {
buffer.push(" ");
} else {
first = false;
}
await writeValue(val, buffer, transform);
}
buffer.push("]");
}
@@ -145,7 +146,11 @@ async function writeValue(value, buffer, transform) {
}
buffer.push(`(${escapeString(value)})`);
} else if (typeof value === "number") {
buffer.push(numberToString(value));
// Don't try to round numbers in general, since that could lead to degenerate
// matrices (e.g. [0.000008 0 0 0.000008 0 0]).
// The numbers must be "rounded" only when pdf.js is producing them and the
// current transformation matrix is well known.
buffer.push(value.toString());
} else if (typeof value === "boolean") {
buffer.push(value.toString());
} else if (value instanceof Dict) {
@@ -306,7 +311,7 @@ async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) {
}
computeIDs(baseOffset, xrefInfo, newXref);
buffer.push("trailer\n");
await writeDict(newXref, buffer);
await writeDict(newXref, buffer, null);
buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n");
}
@@ -332,10 +337,17 @@ async function getXRefStreamTable(
const xrefTableData = [];
let maxOffset = 0;
let maxGen = 0;
for (const { ref, data } of newRefs) {
for (const { ref, data, objStreamRef, index } of newRefs) {
let gen;
maxOffset = Math.max(maxOffset, baseOffset);
if (data !== null) {
// The first number in each entry is the type (see 7.5.8.3):
// 0: free object
// 1: in-use object
// 2: compressed object
if (objStreamRef) {
gen = index;
xrefTableData.push([2, objStreamRef.num, gen]);
} else if (data !== null) {
gen = Math.min(ref.gen, 0xffff);
xrefTableData.push([1, baseOffset, gen]);
baseOffset += data.length;
@@ -371,13 +383,13 @@
function computeIDs(baseOffset, xrefInfo, newXref) {
if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
const md5 = computeMD5(baseOffset, xrefInfo);
newXref.set("ID", [xrefInfo.fileIds[0], md5]);
newXref.set("ID", [xrefInfo.fileIds[0] || md5, md5]);
}
}
function getTrailerDict(xrefInfo, changes, useXrefStream) {
const newXref = new Dict(null);
newXref.set("Prev", xrefInfo.startXRef);
newXref.setIfDefined("Prev", xrefInfo?.startXRef);
const refForXrefTable = xrefInfo.newRef;
if (useXrefStream) {
changes.put(refForXrefTable, { data: "" });
@@ -386,21 +398,20 @@
} else {
newXref.set("Size", refForXrefTable.num);
}
if (xrefInfo.rootRef !== null) {
newXref.set("Root", xrefInfo.rootRef);
}
if (xrefInfo.infoRef !== null) {
newXref.set("Info", xrefInfo.infoRef);
}
if (xrefInfo.encryptRef !== null) {
newXref.set("Encrypt", xrefInfo.encryptRef);
}
newXref.setIfDefined("Root", xrefInfo?.rootRef);
newXref.setIfDefined("Info", xrefInfo?.infoRef);
newXref.setIfDefined("Encrypt", xrefInfo?.encryptRef);
return newXref;
}
async function writeChanges(changes, xref, buffer = []) {
const newRefs = [];
for (const [ref, { data }] of changes.items()) {
for (const [ref, { data, objStreamRef, index }] of changes.items()) {
if (objStreamRef) {
newRefs.push({ ref, data, objStreamRef, index });
continue;
}
if (data === null || typeof data === "string") {
newRefs.push({ ref, data });
continue;
@@ -483,4 +494,4 @@ async function incrementalUpdate({
return array;
}
export { incrementalUpdate, writeChanges, writeDict, writeObject };
export { incrementalUpdate, writeChanges, writeDict, writeObject, writeValue };

View File

@@ -1025,6 +1025,24 @@ class PDFDocumentProxy {
return this._transport.saveDocument();
}
/**
* @typedef {Object} PageInfo
* @property {null|Uint8Array} document
* @property {Array<Array<number>|number>} [includePages]
* included ranges or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges or indices.
*/
/**
* @param {Array<PageInfo>} pageInfos - The pages to extract.
* @returns {Promise<Uint8Array>} A promise that is resolved with a
* {Uint8Array} containing the full data of the saved document.
*/
extractPages(pageInfos) {
return this._transport.extractPages(pageInfos);
}
/**
* @returns {Promise<{ length: number }>} A promise that is resolved when the
* document's data is loaded. It is resolved with an {Object} that contains
@@ -2900,6 +2918,10 @@ class WorkerTransport {
});
}
extractPages(pageInfos) {
return this.messageHandler.sendWithPromise("ExtractPages", { pageInfos });
}
getPage(pageNumber) {
if (
!Number.isInteger(pageNumber) ||

View File

@@ -506,6 +506,7 @@ class Driver {
this.inFlightRequests = 0;
this.testFilter = JSON.parse(params.get("testfilter") || "[]");
this.xfaOnly = params.get("xfaonly") === "true";
this.masterMode = params.get("mastermode") === "true";
// Create a working canvas
this.canvas = document.createElement("canvas");
@@ -591,6 +592,25 @@
task.stats = { times: [] };
task.enableXfa = task.enableXfa === true;
if (task.includePages && task.type === "extract") {
if (this.masterMode) {
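// Flatten ranges into explicit page indices: in master mode the reference
// images are rendered from the original file, one task per kept page, and
// numberOfTasks lets the test server detect when all of them have arrived.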
const includePages = [];
for (const page of task.includePages) {
if (Array.isArray(page)) {
for (let i = page[0]; i <= page[1]; i++) {
includePages.push(i);
}
} else {
includePages.push(page);
}
}
task.numberOfTasks = includePages.length;
task.includePages = includePages;
} else {
delete task.pageMapping;
}
}
const prevFile = md5FileMap.get(task.md5);
if (prevFile) {
if (task.file !== prevFile) {
@@ -658,6 +678,20 @@
});
let promise = loadingTask.promise;
if (!this.masterMode && task.type === "extract") {
promise = promise.then(async doc => {
const data = await doc.extractPages([
{
document: null,
includePages: task.includePages,
},
]);
await loadingTask.destroy();
delete task.includePages;
return getDocument(data).promise;
});
}
if (task.annotationStorage) {
for (const annotation of Object.values(task.annotationStorage)) {
const { bitmapName, quadPoints, paths, outlines } = annotation;
@@ -862,7 +896,12 @@
}
}
if (task.skipPages?.includes(task.pageNum)) {
if (
task.skipPages?.includes(task.pageNum) ||
(this.masterMode &&
task.includePages &&
!task.includePages.includes(task.pageNum - 1))
) {
this._log(
` Skipping page ${task.pageNum}/${task.pdfDoc.numPages}...\n`
);
@@ -1274,10 +1313,11 @@
id: task.id,
numPages: task.pdfDoc ? task.lastPage || task.pdfDoc.numPages : 0,
lastPageNum: this._getLastPageNumber(task),
numberOfTasks: task.numberOfTasks ?? -1,
failure,
file: task.file,
round: task.round,
page: task.pageNum,
page: task.pageMapping?.[task.pageNum] ?? task.pageNum,
snapshot,
baselineSnapshot,
stats: task.stats.times,

View File

@@ -754,3 +754,6 @@
!bug1937438_from_word.pdf
!bug1937438_mml_from_latex.pdf
!bug1997343.pdf
!doc_1_3_pages.pdf
!doc_2_3_pages.pdf
!doc_3_3_pages.pdf

BIN
test/pdfs/doc_1_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_2_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_3_3_pages.pdf Executable file

Binary file not shown.

View File

@@ -672,6 +672,7 @@ function checkRefTestResults(browser, id, results) {
case "partial":
case "text":
case "highlight":
case "extract":
checkEq(task, results, browser, session.masterMode);
break;
case "fbf":
@@ -731,6 +732,7 @@ function refTestPostHandler(parsedUrl, req, res) {
var snapshot = data.snapshot;
var baselineSnapshot = data.baselineSnapshot;
var lastPageNum = data.lastPageNum;
var numberOfTasks = data.numberOfTasks;
session = getSession(browser);
monitorBrowserTimeout(session, handleSessionTimeout);
@@ -773,7 +775,10 @@
});
}
var isDone = taskResults.at(-1)?.[lastPageNum - 1];
const lastTaskResults = taskResults.at(-1);
const isDone =
lastTaskResults?.[lastPageNum - 1] ||
lastTaskResults?.filter(result => !!result).length === numberOfTasks;
if (isDone) {
checkRefTestResults(browser, id, taskResults);
session.remaining--;

View File

@@ -13049,5 +13049,23 @@
"rotation": 0
}
}
},
{
"id": "tracemonkey-extract_0_2_12",
"file": "pdfs/tracemonkey.pdf",
"md5": "9a192d8b1a7dc652a19835f6f08098bd",
"rounds": 1,
"type": "extract",
"includePages": [0, 2, 12],
"pageMapping": { "1": 1, "3": 2, "13": 3 }
},
{
"id": "bug900822-encrypted-extract_0",
"file": "pdfs/bug900822.pdf",
"md5": "70e2a3c5922574eeda169c955cf9d084",
"rounds": 1,
"type": "extract",
"includePages": [0],
"pageMapping": { "1": 1 }
}
]

View File

@@ -5335,4 +5335,212 @@ deployment as easy as distributing a source file. They are used for
small scripts as well as for`);
});
});
describe("PDF page editing", function () {
describe("Merge pdfs", function () {
it("should merge three PDFs", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData2 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
});
const pdfData3 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_3_3_pages.pdf",
});
let data = await pdfDoc.extractPages([
{ document: null },
{ document: pdfData2 },
{ document: pdfData3 },
]);
let newLoadingTask = getDocument(data);
let newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: pdfData3 },
{ document: pdfData2 },
{ document: null },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil((10 - i) / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData2, includePages: [0] },
{ document: pdfData3, includePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(3);
for (let i = 1; i <= 3; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document ${i}:Page 1`);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, excludePages: [0] },
{ document: pdfData2, excludePages: [0] },
{ document: pdfData3, excludePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 2)}:Page ${((i - 1) % 2) + 2}`
);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page included ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, includePages: [[0, 0], 2] },
{ document: null, includePages: [[2, 4], 7] },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page excluded ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, excludePages: [[1, 1]] },
{
document: null,
excludePages: [
[0, 1],
[5, 6],
[8, 13],
],
},
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with one with a password", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "pr6531_2.pdf",
});
const data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData1, password: "asdfasdf" },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(2);
const expectedPagesText = ["Document 1:Page 1", ""];
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(expectedPagesText[i - 1]);
}
const page2 = await newPdfDoc.getPage(2);
const annots = await page2.getAnnotations();
expect(annots.length).toEqual(1);
expect(annots[0].contentsObj.str).toEqual(
"Bluebeam should be encrypting this."
);
await newLoadingTask.destroy();
await loadingTask.destroy();
});
});
});
});

View File

@@ -310,6 +310,16 @@ describe("primitives", function () {
expect(rawValues2.sort()).toEqual(expectedRawValues2);
});
it("should get all raw entries", function () {
const expectedRawEntries = [
["FontFile", testFontFile],
["FontFile2", testFontFile2],
["FontFile3", testFontFile3],
];
const rawEntries = Array.from(dictWithManyKeys.getRawEntries());
expect(rawEntries.sort()).toEqual(expectedRawEntries);
});
it("should create only one object for Dict.empty", function () {
const firstDictEmpty = Dict.empty;
const secondDictEmpty = Dict.empty;
@@ -423,6 +433,12 @@
dict.setIfName("k", 1234);
expect(dict.has("k")).toBeFalse();
dict.setIfDict("l", new Dict());
expect(dict.get("l")).toEqual(new Dict());
dict.setIfDict("m", "not a dict");
expect(dict.has("m")).toBeFalse();
});
});

View File

@@ -170,8 +170,8 @@ describe("Writer", function () {
const expected =
"<< /A /B /B 123 456 R /C 789 /D (hello world) " +
"/E (\\(hello\\\\world\\)) /F [1.23 4.5 6] " +
"/G << /H 123 /I << /Length 8>> stream\n" +
"/E (\\(hello\\\\world\\)) /F [1.23001 4.50001 6] " +
"/G << /H 123.00001 /I << /Length 8>> stream\n" +
"a stream\n" +
"endstream>> /J true /K false " +
"/NullArr [null 10] /NullVal null>>";