Merge pull request #20409 from calixteman/split_merge_p1

Add the possibility to create a PDF from different ones (bug 1997379)
Commit 85ed401b82 by calixteman, committed via GitHub on 2025-11-07 15:05:52 +01:00.
19 changed files with 1089 additions and 44 deletions
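In short: the worker gains an "ExtractPages" command, a new core PDFEditor class clones the selected pages (together with all of their object dependencies) into a freshly written PDF, and the API exposes this as PDFDocumentProxy.extractPages. A minimal usage sketch (hedged: variable names are illustrative; the shape follows the api.js hunk and the unit tests below):

const pdfDoc = await getDocument("a.pdf").promise;
// Keep pages 1-3 of the open document, then append every page of another file.
const data = await pdfDoc.extractPages([
  { document: null, includePages: [[0, 2]] }, // null means the open document
  { document: otherFileBytes },               // a Uint8Array holding another PDF
]);
const mergedDoc = await getDocument(data).promise;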

View File

@@ -131,6 +131,19 @@ class DecodeStream extends BaseStream {
getBaseStreams() {
return this.stream ? this.stream.getBaseStreams() : null;
}
clone() {
// Make sure it has been fully read.
while (!this.eof) {
this.readBlock();
}
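// The clone is a plain Stream over the fully decoded bytes (with a cloned
// dict) rather than another DecodeStream.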
return new Stream(
this.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StreamsSequenceStream extends DecodeStream {

View File

@@ -52,6 +52,10 @@ class DecryptStream extends DecodeStream {
buffer.set(chunk, bufferLength);
this.bufferLength = newLength;
}
getOriginalStream() {
return this;
}
}
export { DecryptStream };

View File

@@ -178,7 +178,7 @@ class Page {
);
}
#getBoundingBox(name) {
getBoundingBox(name) {
if (this.xfaData) {
return this.xfaData.bbox;
}
@@ -201,7 +201,7 @@
return shadow(
this,
"mediaBox",
this.#getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
this.getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
);
}
@@ -210,7 +210,7 @@
return shadow(
this,
"cropBox",
this.#getBoundingBox("CropBox") || this.mediaBox
this.getBoundingBox("CropBox") || this.mediaBox
);
}

View File

@@ -0,0 +1,594 @@
/* Copyright 2025 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** @typedef {import("../document.js").PDFDocument} PDFDocument */
/** @typedef {import("../document.js").Page} Page */
/** @typedef {import("../xref.js").XRef} XRef */
import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
import { getModificationDate, stringToPDFString } from "../../shared/util.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { BaseStream } from "../base_stream.js";
import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";
const MAX_LEAVES_PER_PAGES_NODE = 16;
class PageData {
constructor(page, documentData) {
this.page = page;
this.documentData = documentData;
this.annotations = null;
documentData.pagesMap.put(page.ref, this);
}
}
class DocumentData {
constructor(document) {
this.document = document;
this.pagesMap = new RefSetCache();
this.oldRefMapping = new RefSetCache();
}
}
class PDFEditor {
constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
this.hasSingleFile = false;
this.currentDocument = null;
this.oldPages = [];
this.newPages = [];
this.xref = [null];
this.newRefCount = 1;
[this.rootRef, this.rootDict] = this.newDict;
[this.infoRef, this.infoDict] = this.newDict;
[this.pagesRef, this.pagesDict] = this.newDict;
this.namesDict = null;
this.useObjectStreams = useObjectStreams;
this.objStreamRefs = useObjectStreams ? new Set() : null;
this.version = "1.7";
this.title = title;
this.author = author;
}
/**
* Get a new reference for an object in the PDF.
* @returns {Ref}
*/
get newRef() {
const ref = Ref.get(this.newRefCount++, 0);
return ref;
}
/**
* Create a new dictionary and its reference.
* @returns {[Ref, Dict]}
*/
get newDict() {
const ref = this.newRef;
const dict = (this.xref[ref.num] = new Dict());
return [ref, dict];
}
/**
* Clone an object in the PDF.
* @param {*} obj
* @param {XRef} xref
* @returns {Promise<Ref>}
*/
async #cloneObject(obj, xref) {
const ref = this.newRef;
this.xref[ref.num] = await this.#collectDependencies(obj, true, xref);
return ref;
}
/**
* Collect the dependencies of an object and create new references for each
* dependency.
* @param {*} obj
* @param {boolean} mustClone
* @param {XRef} xref
* @returns {Promise<*>}
*/
async #collectDependencies(obj, mustClone, xref) {
if (obj instanceof Ref) {
const {
currentDocument: { oldRefMapping },
} = this;
let newRef = oldRefMapping.get(obj);
if (newRef) {
return newRef;
}
newRef = this.newRef;
oldRefMapping.put(obj, newRef);
obj = await xref.fetchAsync(obj);
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (
obj instanceof Dict &&
isName(obj.get("Type"), "Page") &&
!this.currentDocument.pagesMap.has(obj)
) {
throw new Error(
"Add a deleted page to the document is not supported."
);
}
}
this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
return newRef;
}
const promises = [];
if (Array.isArray(obj)) {
if (mustClone) {
obj = obj.slice();
}
for (let i = 0, ii = obj.length; i < ii; i++) {
promises.push(
this.#collectDependencies(obj[i], true, xref).then(
newObj => (obj[i] = newObj)
)
);
}
await Promise.all(promises);
return obj;
}
let dict;
if (obj instanceof BaseStream) {
({ dict } = obj = obj.getOriginalStream().clone());
} else if (obj instanceof Dict) {
if (mustClone) {
obj = obj.clone();
}
dict = obj;
}
if (dict) {
for (const [key, rawObj] of dict.getRawEntries()) {
promises.push(
this.#collectDependencies(rawObj, true, xref).then(newObj =>
dict.set(key, newObj)
)
);
}
await Promise.all(promises);
}
return obj;
}
/**
* @typedef {Object} PageInfo
* @property {PDFDocument} document
* @property {Array<Array<number>|number>} [includePages]
* included ranges (inclusive) or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges (inclusive) or indices.
*/
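// Note: page indices are 0-based and ranges are inclusive. For a 6-page
// document, includePages: [[0, 2], 5] keeps pages 1-3 and 6, while
// excludePages: [0] keeps everything but the first page; a page is kept when
// it is not excluded and either matches the include filter or no include
// filter is given.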
/**
* Extract pages from the given documents.
* @param {Array<PageInfo>} pageInfos
* @returns {Promise<Uint8Array>} The serialized new PDF document.
*/
async extractPages(pageInfos) {
const promises = [];
let newIndex = 0;
this.hasSingleFile = pageInfos.length === 1;
for (const { document, includePages, excludePages } of pageInfos) {
if (!document) {
continue;
}
const documentData = new DocumentData(document);
promises.push(this.#collectDocumentData(documentData));
let keptIndices, keptRanges, deletedIndices, deletedRanges;
for (const page of includePages || []) {
if (Array.isArray(page)) {
(keptRanges ||= []).push(page);
} else {
(keptIndices ||= new Set()).add(page);
}
}
for (const page of excludePages || []) {
if (Array.isArray(page)) {
(deletedRanges ||= []).push(page);
} else {
(deletedIndices ||= new Set()).add(page);
}
}
for (let i = 0, ii = document.numPages; i < ii; i++) {
if (deletedIndices?.has(i)) {
continue;
}
if (deletedRanges) {
let isDeleted = false;
for (const [start, end] of deletedRanges) {
if (i >= start && i <= end) {
isDeleted = true;
break;
}
}
if (isDeleted) {
continue;
}
}
let takePage = false;
if (keptIndices) {
takePage = keptIndices.has(i);
}
if (!takePage && keptRanges) {
for (const [start, end] of keptRanges) {
if (i >= start && i <= end) {
takePage = true;
break;
}
}
}
if (!takePage && !keptIndices && !keptRanges) {
takePage = true;
}
if (!takePage) {
continue;
}
const newPageIndex = newIndex++;
promises.push(
document.getPage(i).then(page => {
this.oldPages[newPageIndex] = new PageData(page, documentData);
})
);
}
}
await Promise.all(promises);
promises.length = 0;
for (const page of this.oldPages) {
promises.push(this.#postCollectPageData(page));
}
await Promise.all(promises);
for (let i = 0, ii = this.oldPages.length; i < ii; i++) {
this.newPages[i] = await this.#makePageCopy(i);
}
return this.writePDF();
}
/**
* Collect the document data.
* @param {DocumentData} documentData
* @returns {Promise<void>}
*/
async #collectDocumentData(documentData) {}
/**
* Post process the collected page data.
* @param {PageData} pageData
* @returns {Promise<void>}
*/
async #postCollectPageData(pageData) {
const {
page: { xref, annotations },
} = pageData;
if (!annotations) {
return;
}
const promises = [];
let newAnnotations = [];
let newIndex = 0;
// TODO: remove only links to deleted pages.
for (const annotationRef of annotations) {
const newAnnotationIndex = newIndex++;
promises.push(
xref.fetchIfRefAsync(annotationRef).then(async annotationDict => {
if (!isName(annotationDict.get("Subtype"), "Link")) {
newAnnotations[newAnnotationIndex] = annotationRef;
}
})
);
}
await Promise.all(promises);
newAnnotations = newAnnotations.filter(annot => !!annot);
pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
}
/**
* Create a copy of a page.
* @param {number} pageIndex
* @returns {Promise<Ref>} the page reference in the new PDF document.
*/
async #makePageCopy(pageIndex) {
const { page, documentData, annotations } = this.oldPages[pageIndex];
this.currentDocument = documentData;
const { oldRefMapping } = documentData;
const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
const pageRef = this.newRef;
const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
oldRefMapping.put(oldPageRef, pageRef);
// No need to keep these entries as we'll set them again later.
for (const key of [
"Rotate",
"MediaBox",
"CropBox",
"BleedBox",
"TrimBox",
"ArtBox",
"Resources",
"Annots",
"Parent",
"UserUnit",
]) {
pageDict.delete(key);
}
const lastRef = this.newRefCount;
await this.#collectDependencies(pageDict, false, xref);
pageDict.set("Rotate", rotate);
pageDict.set("MediaBox", mediaBox);
for (const boxName of ["CropBox", "BleedBox", "TrimBox", "ArtBox"]) {
const box = page.getBoundingBox(boxName);
if (box?.some((value, index) => value !== mediaBox[index])) {
// These boxes are optional and their default value is the MediaBox.
pageDict.set(boxName, box);
}
}
const userUnit = page.userUnit;
if (userUnit !== 1) {
pageDict.set("UserUnit", userUnit);
}
pageDict.setIfDict(
"Resources",
await this.#collectDependencies(resources, true, xref)
);
pageDict.setIfArray(
"Annots",
await this.#collectDependencies(annotations, true, xref)
);
if (this.useObjectStreams) {
const newLastRef = this.newRefCount;
const pageObjectRefs = [];
for (let i = lastRef; i < newLastRef; i++) {
const obj = this.xref[i];
if (obj instanceof BaseStream) {
continue;
}
pageObjectRefs.push(Ref.get(i, 0));
}
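// A single object stream holds at most 0xffff objects; any overflow spills
// into additional object streams.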
for (let i = 0; i < pageObjectRefs.length; i += 0xffff) {
const objStreamRef = this.newRef;
this.objStreamRefs.add(objStreamRef.num);
this.xref[objStreamRef.num] = pageObjectRefs.slice(i, i + 0xffff);
}
}
this.currentDocument = null;
return pageRef;
}
/**
* Create the page tree structure.
*/
#makePageTree() {
const { newPages: pages, rootDict, pagesRef, pagesDict } = this;
rootDict.set("Pages", pagesRef);
pagesDict.setIfName("Type", "Pages");
pagesDict.set("Count", pages.length);
const maxLeaves =
MAX_LEAVES_PER_PAGES_NODE <= 1 ? pages.length : MAX_LEAVES_PER_PAGES_NODE;
const stack = [{ dict: pagesDict, kids: pages, parentRef: pagesRef }];
while (stack.length > 0) {
const { dict, kids, parentRef } = stack.pop();
if (kids.length <= maxLeaves) {
dict.set("Kids", kids);
for (const ref of kids) {
this.xref[ref.num].set("Parent", parentRef);
}
continue;
}
const chunkSize = Math.max(maxLeaves, Math.ceil(kids.length / maxLeaves));
const kidsChunks = [];
for (let i = 0; i < kids.length; i += chunkSize) {
kidsChunks.push(kids.slice(i, i + chunkSize));
}
const kidsRefs = [];
dict.set("Kids", kidsRefs);
for (const chunk of kidsChunks) {
const [kidRef, kidDict] = this.newDict;
kidsRefs.push(kidRef);
kidDict.setIfName("Type", "Pages");
kidDict.set("Parent", parentRef);
kidDict.set("Count", chunk.length);
stack.push({ dict: kidDict, kids: chunk, parentRef: kidRef });
}
}
}
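// Illustration (hedged): with 100 pages and MAX_LEAVES_PER_PAGES_NODE = 16,
// chunkSize = Math.max(16, Math.ceil(100 / 16)) = 16, so the root /Pages
// node gets 7 intermediate kids, each holding at most 16 page leaves.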
/**
* Create the root dictionary.
* @returns {Promise<void>}
*/
async #makeRoot() {
const { rootDict } = this;
rootDict.setIfName("Type", "Catalog");
rootDict.set("Version", this.version);
this.#makePageTree();
}
/**
* Create the info dictionary.
* @returns {Map} infoMap
*/
#makeInfo() {
const infoMap = new Map();
if (this.hasSingleFile) {
const {
xref: { trailer },
} = this.oldPages[0].documentData.document;
const oldInfoDict = trailer.get("Info");
for (const [key, value] of oldInfoDict || []) {
if (typeof value === "string") {
infoMap.set(key, stringToPDFString(value));
}
}
}
infoMap.delete("ModDate");
infoMap.set("CreationDate", getModificationDate());
infoMap.set("Creator", "PDF.js");
infoMap.set("Producer", "Firefox");
if (this.author) {
infoMap.set("Author", this.author);
}
if (this.title) {
infoMap.set("Title", this.title);
}
for (const [key, value] of infoMap) {
this.infoDict.set(key, stringToAsciiOrUTF16BE(value));
}
return infoMap;
}
/**
* Create the encryption dictionary if required.
* @returns {Promise<[Ref|null, CipherTransformFactory|null, Array|null]>}
*/
async #makeEncrypt() {
if (!this.hasSingleFile) {
return [null, null, null];
}
const { documentData } = this.oldPages[0];
const {
document: {
xref: { trailer, encrypt },
},
} = documentData;
if (!trailer.has("Encrypt")) {
return [null, null, null];
}
const encryptDict = trailer.get("Encrypt");
if (!(encryptDict instanceof Dict)) {
return [null, null, null];
}
this.currentDocument = documentData;
const result = [
await this.#cloneObject(encryptDict, trailer.xref),
encrypt,
trailer.get("ID"),
];
this.currentDocument = null;
return result;
}
/**
* Create the changes required to write the new PDF document.
* @returns {Promise<[RefSetCache, Ref]>}
*/
async #createChanges() {
const changes = new RefSetCache();
changes.put(Ref.get(0, 0xffff), { data: null });
for (let i = 1, ii = this.xref.length; i < ii; i++) {
if (this.objStreamRefs?.has(i)) {
await this.#createObjectStream(Ref.get(i, 0), this.xref[i], changes);
} else {
changes.put(Ref.get(i, 0), { data: this.xref[i] });
}
}
return [changes, this.newRef];
}
/**
* Create an object stream containing the given objects.
* @param {Ref} objStreamRef
* @param {Array<Ref>} objRefs
* @param {RefSetCache} changes
*/
async #createObjectStream(objStreamRef, objRefs, changes) {
const streamBuffer = [""];
const objOffsets = [];
let offset = 0;
const buffer = [];
for (let i = 0, ii = objRefs.length; i < ii; i++) {
const objRef = objRefs[i];
changes.put(objRef, { data: null, objStreamRef, index: i });
objOffsets.push(`${objRef.num} ${offset}`);
const data = this.xref[objRef.num];
await writeValue(data, buffer, /* transform = */ null);
const obj = buffer.join("");
buffer.length = 0;
streamBuffer.push(obj);
offset += obj.length + 1;
}
streamBuffer[0] = objOffsets.join("\n");
const objStream = new StringStream(streamBuffer.join("\n"));
const objStreamDict = (objStream.dict = new Dict());
objStreamDict.setIfName("Type", "ObjStm");
objStreamDict.set("N", objRefs.length);
objStreamDict.set("First", streamBuffer[0].length + 1);
changes.put(objStreamRef, { data: objStream });
}
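// Illustration (hedged) of the resulting object-stream layout for two small
// objects, 12 0 R serialized as "<< /A 1>>" and 15 0 R as "<< /B 2>>":
//   << /Type /ObjStm /N 2 /First 11 >>
//   stream
//   12 0
//   15 10
//   << /A 1>>
//   << /B 2>>
//   endstream
// Each header pair is an object number and its byte offset from /First.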
/**
* Write the new PDF document to a Uint8Array.
* @returns {Promise<Uint8Array>}
*/
async writePDF() {
await this.#makeRoot();
const infoMap = this.#makeInfo();
const [encryptRef, encrypt, fileIds] = await this.#makeEncrypt();
const [changes, xrefTableRef] = await this.#createChanges();
// Create the PDF header in order to help sniffers.
// PDF version must be in the range 1.0 to 1.7 inclusive.
// We add a binary comment line to ensure that the file is treated
// as a binary file by applications that open it.
const header = [
...`%PDF-${this.version}\n%`.split("").map(c => c.charCodeAt(0)),
0xfa,
0xde,
0xfa,
0xce,
];
return incrementalUpdate({
originalData: new Uint8Array(header),
changes,
xrefInfo: {
startXRef: null,
rootRef: this.rootRef,
infoRef: this.infoRef,
encryptRef,
newRef: xrefTableRef,
fileIds: fileIds || [null, null],
infoMap,
},
useXrefStream: this.useObjectStreams,
xref: {
encrypt,
encryptRef,
},
});
}
}
export { PDFEditor };
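For reference, a hedged sketch of driving PDFEditor directly, mirroring what the worker hunk below does; docA and docB stand for already-parsed core PDFDocument instances:

const editor = new PDFEditor({ title: "Merged", author: "me" });
const bytes = await editor.extractPages([
  { document: docA, includePages: [[0, 2]] }, // pages 1-3 of docA
  { document: docB, excludePages: [0] },      // all of docB except page 1
]);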

View File

@@ -188,6 +188,10 @@ class Dict {
return [...this._map.values()];
}
getRawEntries() {
return this._map.entries();
}
set(key, value) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (typeof key !== "string") {
@@ -231,6 +235,12 @@ class Dict {
}
}
setIfDict(key, value) {
if (value instanceof Dict) {
this.set(key, value);
}
}
has(key) {
return this._map.has(key);
}
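A small sketch (hedged) of the two new Dict helpers the editor relies on; the dictionary contents are illustrative:

const d = new Dict(null);
d.set("A", Ref.get(12, 0));
// getRawEntries() iterates [key, value] pairs without resolving references,
// so a Ref stays a Ref instead of being fetched through the XRef.
for (const [key, raw] of d.getRawEntries()) {
  console.log(key, raw instanceof Ref); // "A" true
}
// setIfDict() stores the value only when it really is a Dict.
d.setIfDict("Resources", "not a dict"); // no-op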

View File

@@ -82,6 +82,15 @@ class Stream extends BaseStream {
makeSubStream(start, length, dict = null) {
return new Stream(this.bytes.buffer, start, length, dict);
}
clone() {
return new Stream(
this.bytes.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StringStream extends Stream {

View File

@@ -36,6 +36,7 @@ import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import { AnnotationFactory } from "./annotation.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { PDFEditor } from "./editor/pdf_editor.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { StructTreeRoot } from "./struct_tree.js";
@@ -557,6 +558,97 @@ class WorkerMessageHandler {
return pdfManager.ensureDoc("calculationOrderIds");
});
handler.on("ExtractPages", async function ({ pageInfos }) {
if (!pageInfos) {
warn("extractPages: nothing to extract.");
return null;
}
if (!Array.isArray(pageInfos)) {
pageInfos = [pageInfos];
}
let newDocumentId = 0;
for (const pageInfo of pageInfos) {
if (pageInfo.document === null) {
pageInfo.document = pdfManager.pdfDocument;
} else if (ArrayBuffer.isView(pageInfo.document)) {
const manager = new LocalPdfManager({
source: pageInfo.document,
docId: `${docId}_extractPages_${newDocumentId++}`,
handler,
password: pageInfo.password ?? null,
evaluatorOptions: Object.assign({}, pdfManager.evaluatorOptions),
});
let recoveryMode = false;
let isValid = true;
while (true) {
try {
await manager.requestLoadedStream();
await manager.ensureDoc("checkHeader");
await manager.ensureDoc("parseStartXRef");
await manager.ensureDoc("parse", [recoveryMode]);
break;
} catch (e) {
if (e instanceof XRefParseException) {
if (recoveryMode === false) {
recoveryMode = true;
continue;
} else {
isValid = false;
warn("extractPages: XRefParseException.");
}
} else if (e instanceof PasswordException) {
const task = new WorkerTask(
`PasswordException: response ${e.code}`
);
startWorkerTask(task);
try {
const { password } = await handler.sendWithPromise(
"PasswordRequest",
e
);
manager.updatePassword(password);
} catch {
isValid = false;
warn("extractPages: invalid password.");
} finally {
finishWorkerTask(task);
}
} else {
isValid = false;
warn("extractPages: invalid document.");
}
if (!isValid) {
break;
}
}
}
if (!isValid) {
pageInfo.document = null;
continue;
}
const isPureXfa = await manager.ensureDoc("isPureXfa");
if (isPureXfa) {
pageInfo.document = null;
warn("extractPages does not support pure XFA documents.");
} else {
pageInfo.document = manager.pdfDocument;
}
} else {
warn("extractPages: invalid document.");
}
}
try {
const pdfEditor = new PDFEditor();
const buffer = await pdfEditor.extractPages(pageInfos);
return buffer;
} catch (reason) {
// eslint-disable-next-line no-console
console.error(reason);
return null;
}
});
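// Note (hedged): an encrypted document being merged in can also be unlocked
// up front by passing `password` in its PageInfo entry (as the unit tests
// do), which avoids the PasswordRequest round trip above.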
handler.on(
"SaveDocument",
async function ({ isPureXfa, numPages, annotationStorage, filename }) {

View File

@@ -19,7 +19,6 @@ import {
escapePDFName,
escapeString,
getSizeInBytes,
numberToString,
parseXFAPath,
} from "./core_utils.js";
import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
@@ -27,29 +26,34 @@ import { Stream, StringStream } from "./stream.js";
import { BaseStream } from "./base_stream.js";
import { calculateMD5 } from "./calculate_md5.js";
async function writeObject(ref, obj, buffer, { encrypt = null }) {
const transform = encrypt?.createCipherTransform(ref.num, ref.gen);
async function writeObject(
ref,
obj,
buffer,
{ encrypt = null, encryptRef = null }
) {
// Avoid encrypting the encrypt dictionary.
const transform =
encrypt && encryptRef !== ref
? encrypt.createCipherTransform(ref.num, ref.gen)
: null;
buffer.push(`${ref.num} ${ref.gen} obj\n`);
if (obj instanceof Dict) {
await writeDict(obj, buffer, transform);
} else if (obj instanceof BaseStream) {
await writeStream(obj, buffer, transform);
} else if (Array.isArray(obj) || ArrayBuffer.isView(obj)) {
await writeArray(obj, buffer, transform);
}
await writeValue(obj, buffer, transform);
buffer.push("\nendobj\n");
}
async function writeDict(dict, buffer, transform) {
buffer.push("<<");
for (const key of dict.getKeys()) {
for (const [key, rawObj] of dict.getRawEntries()) {
buffer.push(` /${escapePDFName(key)} `);
await writeValue(dict.getRaw(key), buffer, transform);
await writeValue(rawObj, buffer, transform);
}
buffer.push(">>");
}
async function writeStream(stream, buffer, transform) {
stream = stream.getOriginalStream();
stream.reset();
let bytes = stream.getBytes();
const { dict } = stream;
@@ -67,7 +71,7 @@ async function writeStream(stream, buffer, transform) {
// The number 256 is arbitrary, but it should be reasonable.
const MIN_LENGTH_FOR_COMPRESSING = 256;
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING || isFilterZeroFlateDecode) {
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING && !isFilterZeroFlateDecode) {
try {
const cs = new CompressionStream("deflate");
const writer = cs.writable.getWriter();
@@ -120,14 +124,11 @@ async function writeStream(stream, buffer, transform) {
async function writeArray(array, buffer, transform) {
buffer.push("[");
let first = true;
for (const val of array) {
if (!first) {
for (let i = 0, ii = array.length; i < ii; i++) {
await writeValue(array[i], buffer, transform);
if (i < ii - 1) {
buffer.push(" ");
} else {
first = false;
}
await writeValue(val, buffer, transform);
}
buffer.push("]");
}
@@ -145,7 +146,11 @@ async function writeValue(value, buffer, transform) {
}
buffer.push(`(${escapeString(value)})`);
} else if (typeof value === "number") {
buffer.push(numberToString(value));
// Don't round numbers in general: doing so could produce degenerate
// matrices (e.g. [0.000008 0 0 0.000008 0 0]).
// Numbers must only be "rounded" when pdf.js itself produces them and the
// current transformation matrix is well known.
buffer.push(value.toString());
} else if (typeof value === "boolean") {
buffer.push(value.toString());
} else if (value instanceof Dict) {
@@ -306,7 +311,7 @@ async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) {
}
computeIDs(baseOffset, xrefInfo, newXref);
buffer.push("trailer\n");
await writeDict(newXref, buffer);
await writeDict(newXref, buffer, null);
buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n");
}
@@ -332,10 +337,17 @@ async function getXRefStreamTable(
const xrefTableData = [];
let maxOffset = 0;
let maxGen = 0;
for (const { ref, data } of newRefs) {
for (const { ref, data, objStreamRef, index } of newRefs) {
let gen;
maxOffset = Math.max(maxOffset, baseOffset);
if (data !== null) {
// The first number in each entry is the type (see 7.5.8.3):
// 0: free object
// 1: in-use object
// 2: compressed object
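// For example, an object written at byte offset 4660 yields [1, 4660, gen],
// while entry 3 of object stream 42 yields [2, 42, 3].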
if (objStreamRef) {
gen = index;
xrefTableData.push([2, objStreamRef.num, gen]);
} else if (data !== null) {
gen = Math.min(ref.gen, 0xffff);
xrefTableData.push([1, baseOffset, gen]);
baseOffset += data.length;
@@ -371,13 +383,13 @@
function computeIDs(baseOffset, xrefInfo, newXref) {
if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
const md5 = computeMD5(baseOffset, xrefInfo);
newXref.set("ID", [xrefInfo.fileIds[0], md5]);
newXref.set("ID", [xrefInfo.fileIds[0] || md5, md5]);
}
}
function getTrailerDict(xrefInfo, changes, useXrefStream) {
const newXref = new Dict(null);
newXref.set("Prev", xrefInfo.startXRef);
newXref.setIfDefined("Prev", xrefInfo?.startXRef);
const refForXrefTable = xrefInfo.newRef;
if (useXrefStream) {
changes.put(refForXrefTable, { data: "" });
@@ -386,21 +398,20 @@ function getTrailerDict(xrefInfo, changes, useXrefStream) {
} else {
newXref.set("Size", refForXrefTable.num);
}
if (xrefInfo.rootRef !== null) {
newXref.set("Root", xrefInfo.rootRef);
}
if (xrefInfo.infoRef !== null) {
newXref.set("Info", xrefInfo.infoRef);
}
if (xrefInfo.encryptRef !== null) {
newXref.set("Encrypt", xrefInfo.encryptRef);
}
newXref.setIfDefined("Root", xrefInfo?.rootRef);
newXref.setIfDefined("Info", xrefInfo?.infoRef);
newXref.setIfDefined("Encrypt", xrefInfo?.encryptRef);
return newXref;
}
async function writeChanges(changes, xref, buffer = []) {
const newRefs = [];
for (const [ref, { data }] of changes.items()) {
for (const [ref, { data, objStreamRef, index }] of changes.items()) {
if (objStreamRef) {
newRefs.push({ ref, data, objStreamRef, index });
continue;
}
if (data === null || typeof data === "string") {
newRefs.push({ ref, data });
continue;
@@ -483,4 +494,4 @@ async function incrementalUpdate({
return array;
}
export { incrementalUpdate, writeChanges, writeDict, writeObject };
export { incrementalUpdate, writeChanges, writeDict, writeObject, writeValue };

View File

@@ -1025,6 +1025,24 @@ class PDFDocumentProxy {
return this._transport.saveDocument();
}
/**
* @typedef {Object} PageInfo
* @property {null|Uint8Array} document - The raw bytes of a document to take
* pages from, or null for the currently loaded document.
* @property {Array<Array<number>|number>} [includePages]
* included ranges (inclusive) or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges (inclusive) or indices.
*/
/**
* @param {Array<PageInfo>} pageInfos - The pages to extract.
* @returns {Promise<Uint8Array>} A promise that is resolved with a
* {Uint8Array} containing the full data of the saved document.
*/
extractPages(pageInfos) {
return this._transport.extractPages(pageInfos);
}
/**
* @returns {Promise<{ length: number }>} A promise that is resolved when the
* document's data is loaded. It is resolved with an {Object} that contains
@@ -2900,6 +2918,10 @@ class WorkerTransport {
});
}
extractPages(pageInfos) {
return this.messageHandler.sendWithPromise("ExtractPages", { pageInfos });
}
getPage(pageNumber) {
if (
!Number.isInteger(pageNumber) ||

View File

@@ -506,6 +506,7 @@ class Driver {
this.inFlightRequests = 0;
this.testFilter = JSON.parse(params.get("testfilter") || "[]");
this.xfaOnly = params.get("xfaonly") === "true";
this.masterMode = params.get("mastermode") === "true";
// Create a working canvas
this.canvas = document.createElement("canvas");
@@ -591,6 +592,25 @@ class Driver {
task.stats = { times: [] };
task.enableXfa = task.enableXfa === true;
if (task.includePages && task.type === "extract") {
if (this.masterMode) {
const includePages = [];
for (const page of task.includePages) {
if (Array.isArray(page)) {
for (let i = page[0]; i <= page[1]; i++) {
includePages.push(i);
}
} else {
includePages.push(page);
}
}
task.numberOfTasks = includePages.length;
task.includePages = includePages;
} else {
delete task.pageMapping;
}
}
const prevFile = md5FileMap.get(task.md5);
if (prevFile) {
if (task.file !== prevFile) {
@@ -658,6 +678,20 @@ class Driver {
});
let promise = loadingTask.promise;
if (!this.masterMode && task.type === "extract") {
promise = promise.then(async doc => {
const data = await doc.extractPages([
{
document: null,
includePages: task.includePages,
},
]);
await loadingTask.destroy();
delete task.includePages;
return getDocument(data).promise;
});
}
if (task.annotationStorage) {
for (const annotation of Object.values(task.annotationStorage)) {
const { bitmapName, quadPoints, paths, outlines } = annotation;
@@ -862,7 +896,12 @@ class Driver {
}
}
if (task.skipPages?.includes(task.pageNum)) {
if (
task.skipPages?.includes(task.pageNum) ||
(this.masterMode &&
task.includePages &&
!task.includePages.includes(task.pageNum - 1))
) {
this._log(
` Skipping page ${task.pageNum}/${task.pdfDoc.numPages}...\n`
);
@@ -1274,10 +1313,11 @@ class Driver {
id: task.id,
numPages: task.pdfDoc ? task.lastPage || task.pdfDoc.numPages : 0,
lastPageNum: this._getLastPageNumber(task),
numberOfTasks: task.numberOfTasks ?? -1,
failure,
file: task.file,
round: task.round,
page: task.pageNum,
page: task.pageMapping?.[task.pageNum] ?? task.pageNum,
snapshot,
baselineSnapshot,
stats: task.stats.times,

View File

@@ -754,3 +754,6 @@
!bug1937438_from_word.pdf
!bug1937438_mml_from_latex.pdf
!bug1997343.pdf
!doc_1_3_pages.pdf
!doc_2_3_pages.pdf
!doc_3_3_pages.pdf

BIN
test/pdfs/doc_1_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_2_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_3_3_pages.pdf Executable file

Binary file not shown.

View File

@@ -672,6 +672,7 @@ function checkRefTestResults(browser, id, results) {
case "partial":
case "text":
case "highlight":
case "extract":
checkEq(task, results, browser, session.masterMode);
break;
case "fbf":
@@ -731,6 +732,7 @@ function refTestPostHandler(parsedUrl, req, res) {
var snapshot = data.snapshot;
var baselineSnapshot = data.baselineSnapshot;
var lastPageNum = data.lastPageNum;
var numberOfTasks = data.numberOfTasks;
session = getSession(browser);
monitorBrowserTimeout(session, handleSessionTimeout);
@@ -773,7 +775,10 @@
});
}
var isDone = taskResults.at(-1)?.[lastPageNum - 1];
const lastTaskResults = taskResults.at(-1);
const isDone =
lastTaskResults?.[lastPageNum - 1] ||
lastTaskResults?.filter(result => !!result).length === numberOfTasks;
if (isDone) {
checkRefTestResults(browser, id, taskResults);
session.remaining--;

View File

@@ -13049,5 +13049,23 @@
"rotation": 0
}
}
},
{
"id": "tracemonkey-extract_0_2_12",
"file": "pdfs/tracemonkey.pdf",
"md5": "9a192d8b1a7dc652a19835f6f08098bd",
"rounds": 1,
"type": "extract",
"includePages": [0, 2, 12],
"pageMapping": { "1": 1, "3": 2, "13": 3 }
},
{
"id": "bug900822-encrypted-extract_0",
"file": "pdfs/bug900822.pdf",
"md5": "70e2a3c5922574eeda169c955cf9d084",
"rounds": 1,
"type": "extract",
"includePages": [0],
"pageMapping": { "1": 1 }
}
]

View File

@@ -5335,4 +5335,212 @@ deployment as easy as distributing a source file. They are used for
small scripts as well as for`);
});
});
describe("PDF page editing", function () {
describe("Merge pdfs", function () {
it("should merge three PDFs", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData2 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
});
const pdfData3 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_3_3_pages.pdf",
});
let data = await pdfDoc.extractPages([
{ document: null },
{ document: pdfData2 },
{ document: pdfData3 },
]);
let newLoadingTask = getDocument(data);
let newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: pdfData3 },
{ document: pdfData2 },
{ document: null },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil((10 - i) / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData2, includePages: [0] },
{ document: pdfData3, includePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(3);
for (let i = 1; i <= 3; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document ${i}:Page 1`);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, excludePages: [0] },
{ document: pdfData2, excludePages: [0] },
{ document: pdfData3, excludePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 2)}:Page ${((i - 1) % 2) + 2}`
);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page included ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, includePages: [[0, 0], 2] },
{ document: null, includePages: [[2, 4], 7] },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page excluded ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, excludePages: [[1, 1]] },
{
document: null,
excludePages: [
[0, 1],
[5, 6],
[8, 13],
],
},
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with one with a password", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "pr6531_2.pdf",
});
const data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData1, password: "asdfasdf" },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(2);
const expectedPagesText = ["Document 1:Page 1", ""];
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(expectedPagesText[i - 1]);
}
const page2 = await newPdfDoc.getPage(2);
const annots = await page2.getAnnotations();
expect(annots.length).toEqual(1);
expect(annots[0].contentsObj.str).toEqual(
"Bluebeam should be encrypting this."
);
await newLoadingTask.destroy();
await loadingTask.destroy();
});
});
});
});

View File

@@ -310,6 +310,16 @@ describe("primitives", function () {
expect(rawValues2.sort()).toEqual(expectedRawValues2);
});
it("should get all raw entries", function () {
const expectedRawEntries = [
["FontFile", testFontFile],
["FontFile2", testFontFile2],
["FontFile3", testFontFile3],
];
const rawEntries = Array.from(dictWithManyKeys.getRawEntries());
expect(rawEntries.sort()).toEqual(expectedRawEntries);
});
it("should create only one object for Dict.empty", function () {
const firstDictEmpty = Dict.empty;
const secondDictEmpty = Dict.empty;
@@ -423,6 +433,12 @@ describe("primitives", function () {
dict.setIfName("k", 1234);
expect(dict.has("k")).toBeFalse();
dict.setIfDict("l", new Dict());
expect(dict.get("l")).toEqual(new Dict());
dict.setIfDict("m", "not a dict");
expect(dict.has("m")).toBeFalse();
});
});

View File

@@ -170,8 +170,8 @@ describe("Writer", function () {
const expected =
"<< /A /B /B 123 456 R /C 789 /D (hello world) " +
"/E (\\(hello\\\\world\\)) /F [1.23 4.5 6] " +
"/G << /H 123 /I << /Length 8>> stream\n" +
"/E (\\(hello\\\\world\\)) /F [1.23001 4.50001 6] " +
"/G << /H 123.00001 /I << /Length 8>> stream\n" +
"a stream\n" +
"endstream>> /J true /K false " +
"/NullArr [null 10] /NullVal null>>";