diff --git a/src/core/decode_stream.js b/src/core/decode_stream.js index 80bdcebd0..b541ed898 100644 --- a/src/core/decode_stream.js +++ b/src/core/decode_stream.js @@ -131,6 +131,19 @@ class DecodeStream extends BaseStream { getBaseStreams() { return this.stream ? this.stream.getBaseStreams() : null; } + + clone() { + // Make sure it has been fully read. + while (!this.eof) { + this.readBlock(); + } + return new Stream( + this.buffer, + this.start, + this.end - this.start, + this.dict.clone() + ); + } } class StreamsSequenceStream extends DecodeStream { diff --git a/src/core/decrypt_stream.js b/src/core/decrypt_stream.js index 8e93b9f86..78fbc5ae5 100644 --- a/src/core/decrypt_stream.js +++ b/src/core/decrypt_stream.js @@ -52,6 +52,10 @@ class DecryptStream extends DecodeStream { buffer.set(chunk, bufferLength); this.bufferLength = newLength; } + + getOriginalStream() { + return this; + } } export { DecryptStream }; diff --git a/src/core/document.js b/src/core/document.js index 7bc738bb5..f624458cb 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -178,7 +178,7 @@ class Page { ); } - #getBoundingBox(name) { + getBoundingBox(name) { if (this.xfaData) { return this.xfaData.bbox; } @@ -201,7 +201,7 @@ class Page { return shadow( this, "mediaBox", - this.#getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX + this.getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX ); } @@ -210,7 +210,7 @@ class Page { return shadow( this, "cropBox", - this.#getBoundingBox("CropBox") || this.mediaBox + this.getBoundingBox("CropBox") || this.mediaBox ); } diff --git a/src/core/editor/pdf_editor.js b/src/core/editor/pdf_editor.js new file mode 100644 index 000000000..7df909156 --- /dev/null +++ b/src/core/editor/pdf_editor.js @@ -0,0 +1,594 @@ +/* Copyright 2025 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/** @typedef {import("../document.js").PDFDocument} PDFDocument */
/** @typedef {import("../document.js").Page} Page */
/** @typedef {import("../xref.js").XRef} XRef */

import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
import { getModificationDate, stringToPDFString } from "../../shared/util.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { BaseStream } from "../base_stream.js";
import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";

// Maximum number of kids attached to a single /Pages node when the page tree
// is rebuilt. A value <= 1 disables the splitting (flat tree).
const MAX_LEAVES_PER_PAGES_NODE = 16;

/**
 * Per-page bookkeeping for a page which is kept in the new document.
 * Creating an instance registers it in `documentData.pagesMap`, keyed by the
 * reference of the page in its original document.
 */
class PageData {
  /**
   * @param {Page} page
   * @param {DocumentData} documentData
   */
  constructor(page, documentData) {
    this.page = page;
    this.documentData = documentData;
    // Filled in by PDFEditor once the annotations have been filtered.
    this.annotations = null;

    documentData.pagesMap.put(page.ref, this);
  }
}

/**
 * Per-source-document bookkeeping.
 * - `pagesMap`: original page reference -> PageData, for the kept pages;
 * - `oldRefMapping`: original reference -> reference in the new document.
 */
class DocumentData {
  /**
   * @param {PDFDocument} document
   */
  constructor(document) {
    this.document = document;
    this.pagesMap = new RefSetCache();
    this.oldRefMapping = new RefSetCache();
  }
}

/**
 * Build a brand new PDF document from pages picked in one or several
 * existing documents.
 */
class PDFEditor {
  /**
   * @param {Object} [options]
   * @param {boolean} [options.useObjectStreams] - Pack the non-stream objects
   *   in object streams and write a cross-reference stream.
   * @param {string} [options.title] - Overrides the /Title info entry.
   * @param {string} [options.author] - Overrides the /Author info entry.
   */
  constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
    this.hasSingleFile = false;
    this.currentDocument = null;
    this.oldPages = [];
    this.newPages = [];
    // New objects, indexed by object number (0 is the head of the free list).
    this.xref = [null];
    this.newRefCount = 1;
    [this.rootRef, this.rootDict] = this.newDict;
    [this.infoRef, this.infoDict] = this.newDict;
    [this.pagesRef, this.pagesDict] = this.newDict;
    this.namesDict = null;
    this.useObjectStreams = useObjectStreams;
    // Object numbers of the object streams; their `xref` slot holds the list
    // of the references to pack in the stream.
    this.objStreamRefs = useObjectStreams ? new Set() : null;
    this.version = "1.7";
    this.title = title;
    this.author = author;
  }

  /**
   * Get a new reference for an object in the PDF.
   * NOTE: each access allocates a new object number.
   * @returns {Ref}
   */
  get newRef() {
    return Ref.get(this.newRefCount++, 0);
  }

  /**
   * Create a new dictionary and its reference.
   * NOTE: each access allocates a new object.
   * @returns {[Ref, Dict]}
   */
  get newDict() {
    const ref = this.newRef;
    const dict = (this.xref[ref.num] = new Dict());
    return [ref, dict];
  }

  /**
   * Clone an object in the PDF.
   * @param {*} obj
   * @param {XRef} xref
   * @returns {Promise<Ref>} the reference of the clone in the new document.
   */
  async #cloneObject(obj, xref) {
    const ref = this.newRef;
    this.xref[ref.num] = await this.#collectDependencies(obj, true, xref);
    return ref;
  }

  /**
   * Collect the dependencies of an object and create new references for each
   * dependency.
   * @param {*} obj
   * @param {boolean} mustClone - When false, a top-level array/dictionary is
   *   updated in place instead of being copied first.
   * @param {XRef} xref - The xref of the document `obj` comes from.
   * @returns {Promise<*>} the object to use in the new document.
   */
  async #collectDependencies(obj, mustClone, xref) {
    if (obj instanceof Ref) {
      const {
        currentDocument: { oldRefMapping, pagesMap },
      } = this;
      const oldRef = obj;
      let newRef = oldRefMapping.get(oldRef);
      if (newRef) {
        return newRef;
      }
      // Register the mapping before fetching in order to handle cycles.
      newRef = this.newRef;
      oldRefMapping.put(oldRef, newRef);
      obj = await xref.fetchAsync(oldRef);

      if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
        // `pagesMap` is keyed by the original page references, hence the
        // lookup must use the reference and not the fetched dictionary.
        if (
          obj instanceof Dict &&
          isName(obj.get("Type"), "Page") &&
          !pagesMap.has(oldRef)
        ) {
          throw new Error(
            "Add a deleted page to the document is not supported."
          );
        }
      }

      this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
      return newRef;
    }

    const promises = [];
    if (Array.isArray(obj)) {
      if (mustClone) {
        obj = obj.slice();
      }
      for (let i = 0, ii = obj.length; i < ii; i++) {
        promises.push(
          this.#collectDependencies(obj[i], true, xref).then(
            newObj => (obj[i] = newObj)
          )
        );
      }
      await Promise.all(promises);
      return obj;
    }

    let dict;
    if (obj instanceof BaseStream) {
      // Streams are always cloned; only their dictionary holds references.
      obj = obj.getOriginalStream().clone();
      dict = obj.dict;
    } else if (obj instanceof Dict) {
      if (mustClone) {
        obj = obj.clone();
      }
      dict = obj;
    }
    if (dict) {
      for (const [key, rawObj] of dict.getRawEntries()) {
        promises.push(
          this.#collectDependencies(rawObj, true, xref).then(newObj =>
            dict.set(key, newObj)
          )
        );
      }
      await Promise.all(promises);
    }

    // Anything else (number, string, name, ...) is used as is.
    return obj;
  }

  /**
   * @typedef {Object} PageInfo
   * @property {PDFDocument} document
   * @property {Array<Array<number>|number>} [includePages]
   *  included ranges (inclusive) or indices.
   * @property {Array<Array<number>|number>} [excludePages]
   *  excluded ranges (inclusive) or indices.
   */

  /**
   * Extract pages from the given documents.
   * Entries without a document are ignored. Exclusions win over inclusions
   * and, when nothing is included explicitly, every non-excluded page is
   * kept. Page indices are 0-based.
   * @param {Array<PageInfo>} pageInfos
   * @return {Promise<Uint8Array>} the data of the new document.
   */
  async extractPages(pageInfos) {
    const isInRanges = (ranges, index) =>
      !!ranges?.some(([start, end]) => index >= start && index <= end);

    const promises = [];
    let newIndex = 0;
    this.hasSingleFile = pageInfos.length === 1;
    for (const { document, includePages, excludePages } of pageInfos) {
      if (!document) {
        continue;
      }
      const documentData = new DocumentData(document);
      promises.push(this.#collectDocumentData(documentData));

      let keptIndices, keptRanges, deletedIndices, deletedRanges;
      for (const page of includePages || []) {
        if (Array.isArray(page)) {
          (keptRanges ||= []).push(page);
        } else {
          (keptIndices ||= new Set()).add(page);
        }
      }
      for (const page of excludePages || []) {
        if (Array.isArray(page)) {
          (deletedRanges ||= []).push(page);
        } else {
          (deletedIndices ||= new Set()).add(page);
        }
      }
      const hasIncludeList = !!(keptIndices || keptRanges);

      for (let i = 0, ii = document.numPages; i < ii; i++) {
        if (deletedIndices?.has(i) || isInRanges(deletedRanges, i)) {
          continue;
        }
        if (
          hasIncludeList &&
          !keptIndices?.has(i) &&
          !isInRanges(keptRanges, i)
        ) {
          continue;
        }
        // The slot is reserved now in order to keep the pages ordered,
        // whatever the order the promises are resolved in.
        const newPageIndex = newIndex++;
        promises.push(
          document.getPage(i).then(page => {
            this.oldPages[newPageIndex] = new PageData(page, documentData);
          })
        );
      }
    }
    await Promise.all(promises);
    promises.length = 0;

    for (const page of this.oldPages) {
      promises.push(this.#postCollectPageData(page));
    }
    await Promise.all(promises);

    // The pages are copied one after the other because the copy relies on
    // `this.currentDocument` and on contiguous ranges of new references.
    for (let i = 0, ii = this.oldPages.length; i < ii; i++) {
      this.newPages[i] = await this.#makePageCopy(i);
    }

    return this.writePDF();
  }

  /**
   * Collect the document data.
   * Currently a placeholder: nothing is collected at the document level.
   * @param {DocumentData} documentData
   * @return {Promise<void>}
   */
  async #collectDocumentData(documentData) {}

  /**
   * Post process the collected page data: compute the list of the
   * annotations to keep for the page (null when there are none).
   * @param {PageData} pageData
   * @returns {Promise<void>}
   */
  async #postCollectPageData(pageData) {
    const {
      page: { xref, annotations },
    } = pageData;

    if (!annotations) {
      return;
    }

    // TODO: remove only links to deleted pages.
    const promises = [];
    for (const annotationRef of annotations) {
      promises.push(
        xref.fetchIfRefAsync(annotationRef).then(annotationDict =>
          // An entry which isn't a dictionary can't be a valid annotation:
          // drop it instead of failing the whole extraction.
          annotationDict instanceof Dict &&
          !isName(annotationDict.get("Subtype"), "Link")
            ? annotationRef
            : null
        )
      );
    }
    const newAnnotations = (await Promise.all(promises)).filter(
      annot => !!annot
    );
    pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
  }

  /**
   * Create a copy of a page.
   * @param {number} pageIndex - Index in `this.oldPages`.
   * @returns {Promise<Ref>} the page reference in the new PDF document.
   */
  async #makePageCopy(pageIndex) {
    const { page, documentData, annotations } = this.oldPages[pageIndex];
    this.currentDocument = documentData;
    const { oldRefMapping } = documentData;
    const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
    const pageRef = this.newRef;
    const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
    oldRefMapping.put(oldPageRef, pageRef);

    // No need to keep these entries as we'll set them again later.
    for (const key of [
      "Rotate",
      "MediaBox",
      "CropBox",
      "BleedBox",
      "TrimBox",
      "ArtBox",
      "Resources",
      "Annots",
      "Parent",
      "UserUnit",
    ]) {
      pageDict.delete(key);
    }

    // Every reference allocated from now on belongs to this page.
    const lastRef = this.newRefCount;
    await this.#collectDependencies(pageDict, false, xref);

    pageDict.set("Rotate", rotate);
    pageDict.set("MediaBox", mediaBox);
    for (const boxName of ["CropBox", "BleedBox", "TrimBox", "ArtBox"]) {
      const box = page.getBoundingBox(boxName);
      if (box?.some((value, index) => value !== mediaBox[index])) {
        // These boxes are optional and their default value is the MediaBox.
        pageDict.set(boxName, box);
      }
    }
    const userUnit = page.userUnit;
    if (userUnit !== 1) {
      pageDict.set("UserUnit", userUnit);
    }
    pageDict.setIfDict(
      "Resources",
      await this.#collectDependencies(resources, true, xref)
    );
    pageDict.setIfArray(
      "Annots",
      await this.#collectDependencies(annotations, true, xref)
    );

    if (this.useObjectStreams) {
      const newLastRef = this.newRefCount;
      const pageObjectRefs = [];
      for (let i = lastRef; i < newLastRef; i++) {
        // Streams can't be stored in an object stream.
        if (this.xref[i] instanceof BaseStream) {
          continue;
        }
        pageObjectRefs.push(Ref.get(i, 0));
      }
      for (let i = 0; i < pageObjectRefs.length; i += 0xffff) {
        const objStreamRef = this.newRef;
        this.objStreamRefs.add(objStreamRef.num);
        this.xref[objStreamRef.num] = pageObjectRefs.slice(i, i + 0xffff);
      }
    }

    this.currentDocument = null;

    return pageRef;
  }

  /**
   * Create the page tree structure: the pages are attached to the root
   * /Pages node, through intermediate /Pages nodes when there are more than
   * MAX_LEAVES_PER_PAGES_NODE of them.
   */
  #makePageTree() {
    const { newPages: pages, rootDict, pagesRef, pagesDict } = this;
    rootDict.set("Pages", pagesRef);
    pagesDict.setIfName("Type", "Pages");
    pagesDict.set("Count", pages.length);

    const maxLeaves =
      MAX_LEAVES_PER_PAGES_NODE <= 1 ? pages.length : MAX_LEAVES_PER_PAGES_NODE;
    const stack = [{ dict: pagesDict, kids: pages, parentRef: pagesRef }];

    while (stack.length > 0) {
      const { dict, kids, parentRef } = stack.pop();
      if (kids.length <= maxLeaves) {
        // Few enough pages: they're the direct kids of this node.
        dict.set("Kids", kids);
        for (const ref of kids) {
          this.xref[ref.num].set("Parent", parentRef);
        }
        continue;
      }
      // Split the pages in at most `maxLeaves` chunks, each chunk getting
      // its own intermediate node.
      const chunkSize = Math.max(maxLeaves, Math.ceil(kids.length / maxLeaves));
      const kidsChunks = [];
      for (let i = 0; i < kids.length; i += chunkSize) {
        kidsChunks.push(kids.slice(i, i + chunkSize));
      }
      const kidsRefs = [];
      dict.set("Kids", kidsRefs);
      for (const chunk of kidsChunks) {
        const [kidRef, kidDict] = this.newDict;
        kidsRefs.push(kidRef);
        kidDict.setIfName("Type", "Pages");
        kidDict.set("Parent", parentRef);
        kidDict.set("Count", chunk.length);
        stack.push({ dict: kidDict, kids: chunk, parentRef: kidRef });
      }
    }
  }

  /**
   * Create the root dictionary.
   * @returns {Promise<void>}
   */
  async #makeRoot() {
    const { rootDict } = this;
    rootDict.setIfName("Type", "Catalog");
    // The /Version entry of the catalog must be a name (e.g. /1.7) and not a
    // string.
    rootDict.setIfName("Version", this.version);
    this.#makePageTree();
  }

  /**
   * Create the info dictionary.
   * When a single file is used, its string entries are kept (except the
   * modification date) unless they're overridden.
   * @returns {Map<string, string>} infoMap
   */
  #makeInfo() {
    const infoMap = new Map();
    if (this.hasSingleFile) {
      const {
        xref: { trailer },
      } = this.oldPages[0].documentData.document;
      const oldInfoDict = trailer.get("Info");
      // In a malformed file /Info can be anything: only use a dictionary.
      if (oldInfoDict instanceof Dict) {
        for (const [key, value] of oldInfoDict) {
          if (typeof value === "string") {
            infoMap.set(key, stringToPDFString(value));
          }
        }
      }
    }
    infoMap.delete("ModDate");
    infoMap.set("CreationDate", getModificationDate());
    infoMap.set("Creator", "PDF.js");
    infoMap.set("Producer", "Firefox");

    if (this.author) {
      infoMap.set("Author", this.author);
    }
    if (this.title) {
      infoMap.set("Title", this.title);
    }
    for (const [key, value] of infoMap) {
      this.infoDict.set(key, stringToAsciiOrUTF16BE(value));
    }
    return infoMap;
  }

  /**
   * Create the encryption dictionary if required, i.e. when the pages come
   * from a single file having an /Encrypt dictionary in its trailer.
   * @returns {Promise<[Ref|null, CipherTransformFactory|null, Array|null]>}
   *   the reference of the cloned /Encrypt dictionary, the `encrypt` object
   *   of the original xref and the original file ids.
   */
  async #makeEncrypt() {
    if (!this.hasSingleFile) {
      return [null, null, null];
    }
    const { documentData } = this.oldPages[0];
    const {
      document: {
        xref: { trailer, encrypt },
      },
    } = documentData;
    if (!trailer.has("Encrypt")) {
      return [null, null, null];
    }
    const encryptDict = trailer.get("Encrypt");
    if (!(encryptDict instanceof Dict)) {
      return [null, null, null];
    }
    this.currentDocument = documentData;
    const result = [
      await this.#cloneObject(encryptDict, trailer.xref),
      encrypt,
      trailer.get("ID"),
    ];
    this.currentDocument = null;
    return result;
  }

  /**
   * Create the changes required to write the new PDF document.
   * @returns {Promise<[RefSetCache, Ref]>} the changes and a new reference
   *   (used by the caller for the xref table).
   */
  async #createChanges() {
    const changes = new RefSetCache();
    // Object 0 is the head of the free list.
    changes.put(Ref.get(0, 0xffff), { data: null });
    for (let i = 1, ii = this.xref.length; i < ii; i++) {
      if (this.objStreamRefs?.has(i)) {
        // For an object stream, the slot contains the refs to pack.
        await this.#createObjectStream(Ref.get(i, 0), this.xref[i], changes);
      } else {
        changes.put(Ref.get(i, 0), { data: this.xref[i] });
      }
    }

    return [changes, this.newRef];
  }

  /**
   * Create an object stream containing the given objects.
   * The stream data starts with "objNum offset" pairs (one per object),
   * followed by the serialized objects; the offsets are relative to the
   * first object (see /First).
   * @param {Ref} objStreamRef
   * @param {Array<Ref>} objRefs
   * @param {RefSetCache} changes
   * @returns {Promise<void>}
   */
  async #createObjectStream(objStreamRef, objRefs, changes) {
    // The first slot is reserved for the "objNum offset" pairs.
    const streamBuffer = [""];
    const objOffsets = [];
    let offset = 0;
    const buffer = [];
    for (let i = 0, ii = objRefs.length; i < ii; i++) {
      const objRef = objRefs[i];
      changes.put(objRef, { data: null, objStreamRef, index: i });
      objOffsets.push(`${objRef.num} ${offset}`);
      const data = this.xref[objRef.num];
      await writeValue(data, buffer, /* transform = */ null);
      const obj = buffer.join("");
      buffer.length = 0;
      streamBuffer.push(obj);
      // +1 for the "\n" used to join the objects.
      offset += obj.length + 1;
    }
    streamBuffer[0] = objOffsets.join("\n");
    const objStream = new StringStream(streamBuffer.join("\n"));
    const objStreamDict = (objStream.dict = new Dict());
    objStreamDict.setIfName("Type", "ObjStm");
    objStreamDict.set("N", objRefs.length);
    objStreamDict.set("First", streamBuffer[0].length + 1);

    changes.put(objStreamRef, { data: objStream });
  }

  /**
   * Write the new PDF document to a Uint8Array.
   * @returns {Promise<Uint8Array>}
   */
  async writePDF() {
    await this.#makeRoot();
    const infoMap = this.#makeInfo();
    const [encryptRef, encrypt, fileIds] = await this.#makeEncrypt();
    const [changes, xrefTableRef] = await this.#createChanges();

    // Create the PDF header in order to help sniffers.
    // PDF version must be in the range 1.0 to 1.7 inclusive.
    // We add a binary comment line to ensure that the file is treated
    // as a binary file by applications that open it.
    const header = [
      ...`%PDF-${this.version}\n%`.split("").map(c => c.charCodeAt(0)),
      0xfa,
      0xde,
      0xfa,
      0xce,
    ];
    // The new document is written as an "update" of a file which only
    // contains the header.
    return incrementalUpdate({
      originalData: new Uint8Array(header),
      changes,
      xrefInfo: {
        startXRef: null,
        rootRef: this.rootRef,
        infoRef: this.infoRef,
        encryptRef,
        newRef: xrefTableRef,
        fileIds: fileIds || [null, null],
        infoMap,
      },
      useXrefStream: this.useObjectStreams,
      xref: {
        encrypt,
        encryptRef,
      },
    });
  }
}

export { PDFEditor };
// Build a new PDF from pages of the current document (`document: null`)
// and/or of other documents given as raw data (typed arrays).
// Each raw document is loaded with its own LocalPdfManager; a document which
// can't be loaded, or which is pure XFA, gets its `document` set to null and
// is consequently ignored by the editor.
// Resolves with the data of the new document or with null on failure.
handler.on("ExtractPages", async function ({ pageInfos }) {
  if (!pageInfos) {
    warn("extractPages: nothing to extract.");
    return null;
  }
  if (!Array.isArray(pageInfos)) {
    pageInfos = [pageInfos];
  }
  let newDocumentId = 0;
  for (const pageInfo of pageInfos) {
    if (pageInfo.document === null) {
      // null stands for the document handled by this worker.
      pageInfo.document = pdfManager.pdfDocument;
      continue;
    }
    if (!ArrayBuffer.isView(pageInfo.document)) {
      warn("extractPages: invalid document.");
      // Don't hand over something which isn't a document to the editor.
      pageInfo.document = null;
      continue;
    }

    const manager = new LocalPdfManager({
      source: pageInfo.document,
      docId: `${docId}_extractPages_${newDocumentId++}`,
      handler,
      password: pageInfo.password ?? null,
      evaluatorOptions: Object.assign({}, pdfManager.evaluatorOptions),
    });
    let recoveryMode = false;
    let isValid = true;
    // Retry the loading once in recovery mode when the xref is broken and
    // as long as the user provides a password when one is required.
    while (true) {
      try {
        await manager.requestLoadedStream();
        await manager.ensureDoc("checkHeader");
        await manager.ensureDoc("parseStartXRef");
        await manager.ensureDoc("parse", [recoveryMode]);
        break;
      } catch (e) {
        if (e instanceof XRefParseException) {
          if (recoveryMode === false) {
            recoveryMode = true;
            continue;
          } else {
            isValid = false;
            warn("extractPages: XRefParseException.");
          }
        } else if (e instanceof PasswordException) {
          const task = new WorkerTask(`PasswordException: response ${e.code}`);

          startWorkerTask(task);

          try {
            const { password } = await handler.sendWithPromise(
              "PasswordRequest",
              e
            );
            manager.updatePassword(password);
          } catch {
            isValid = false;
            warn("extractPages: invalid password.");
          } finally {
            finishWorkerTask(task);
          }
        } else {
          isValid = false;
          warn("extractPages: invalid document.");
        }
        if (!isValid) {
          break;
        }
      }
    }
    if (!isValid) {
      // The document couldn't be loaded: skip it, and especially don't
      // query it nor overwrite the null below with a broken document.
      pageInfo.document = null;
      continue;
    }

    const isPureXfa = await manager.ensureDoc("isPureXfa");
    if (isPureXfa) {
      pageInfo.document = null;
      warn("extractPages does not support pure XFA documents.");
    } else {
      pageInfo.document = manager.pdfDocument;
    }
  }

  try {
    const pdfEditor = new PDFEditor();
    return await pdfEditor.extractPages(pageInfos);
  } catch (reason) {
    // eslint-disable-next-line no-console
    console.error(reason);
    return null;
  }
});
encrypt.createCipherTransform(ref.num, ref.gen) + : null; buffer.push(`${ref.num} ${ref.gen} obj\n`); - if (obj instanceof Dict) { - await writeDict(obj, buffer, transform); - } else if (obj instanceof BaseStream) { - await writeStream(obj, buffer, transform); - } else if (Array.isArray(obj) || ArrayBuffer.isView(obj)) { - await writeArray(obj, buffer, transform); - } + await writeValue(obj, buffer, transform); buffer.push("\nendobj\n"); } async function writeDict(dict, buffer, transform) { buffer.push("<<"); - for (const key of dict.getKeys()) { + for (const [key, rawObj] of dict.getRawEntries()) { buffer.push(` /${escapePDFName(key)} `); - await writeValue(dict.getRaw(key), buffer, transform); + await writeValue(rawObj, buffer, transform); } buffer.push(">>"); } async function writeStream(stream, buffer, transform) { + stream = stream.getOriginalStream(); + stream.reset(); let bytes = stream.getBytes(); const { dict } = stream; @@ -67,7 +71,7 @@ async function writeStream(stream, buffer, transform) { // The number 256 is arbitrary, but it should be reasonable. 
const MIN_LENGTH_FOR_COMPRESSING = 256; - if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING || isFilterZeroFlateDecode) { + if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING && !isFilterZeroFlateDecode) { try { const cs = new CompressionStream("deflate"); const writer = cs.writable.getWriter(); @@ -120,14 +124,11 @@ async function writeStream(stream, buffer, transform) { async function writeArray(array, buffer, transform) { buffer.push("["); - let first = true; - for (const val of array) { - if (!first) { + for (let i = 0, ii = array.length; i < ii; i++) { + await writeValue(array[i], buffer, transform); + if (i < ii - 1) { buffer.push(" "); - } else { - first = false; } - await writeValue(val, buffer, transform); } buffer.push("]"); } @@ -145,7 +146,11 @@ async function writeValue(value, buffer, transform) { } buffer.push(`(${escapeString(value)})`); } else if (typeof value === "number") { - buffer.push(numberToString(value)); + // Don't try to round numbers in general, it could lead to have degenerate + // matrices (e.g. [0.000008 0 0 0.000008 0 0]). + // The numbers must be "rounded" only when pdf.js is producing them and the + // current transformation matrix is well known. 
+ buffer.push(value.toString()); } else if (typeof value === "boolean") { buffer.push(value.toString()); } else if (value instanceof Dict) { @@ -306,7 +311,7 @@ async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) { } computeIDs(baseOffset, xrefInfo, newXref); buffer.push("trailer\n"); - await writeDict(newXref, buffer); + await writeDict(newXref, buffer, null); buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n"); } @@ -332,10 +337,17 @@ async function getXRefStreamTable( const xrefTableData = []; let maxOffset = 0; let maxGen = 0; - for (const { ref, data } of newRefs) { + for (const { ref, data, objStreamRef, index } of newRefs) { let gen; maxOffset = Math.max(maxOffset, baseOffset); - if (data !== null) { + // The first number in each entry is the type (see 7.5.8.3): + // 0: free object + // 1: in-use object + // 2: compressed object + if (objStreamRef) { + gen = index; + xrefTableData.push([2, objStreamRef.num, gen]); + } else if (data !== null) { gen = Math.min(ref.gen, 0xffff); xrefTableData.push([1, baseOffset, gen]); baseOffset += data.length; @@ -371,13 +383,13 @@ async function getXRefStreamTable( function computeIDs(baseOffset, xrefInfo, newXref) { if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) { const md5 = computeMD5(baseOffset, xrefInfo); - newXref.set("ID", [xrefInfo.fileIds[0], md5]); + newXref.set("ID", [xrefInfo.fileIds[0] || md5, md5]); } } function getTrailerDict(xrefInfo, changes, useXrefStream) { const newXref = new Dict(null); - newXref.set("Prev", xrefInfo.startXRef); + newXref.setIfDefined("Prev", xrefInfo?.startXRef); const refForXrefTable = xrefInfo.newRef; if (useXrefStream) { changes.put(refForXrefTable, { data: "" }); @@ -386,21 +398,20 @@ function getTrailerDict(xrefInfo, changes, useXrefStream) { } else { newXref.set("Size", refForXrefTable.num); } - if (xrefInfo.rootRef !== null) { - newXref.set("Root", xrefInfo.rootRef); - } - if (xrefInfo.infoRef !== null) { - 
newXref.set("Info", xrefInfo.infoRef); - } - if (xrefInfo.encryptRef !== null) { - newXref.set("Encrypt", xrefInfo.encryptRef); - } + newXref.setIfDefined("Root", xrefInfo?.rootRef); + newXref.setIfDefined("Info", xrefInfo?.infoRef); + newXref.setIfDefined("Encrypt", xrefInfo?.encryptRef); + return newXref; } async function writeChanges(changes, xref, buffer = []) { const newRefs = []; - for (const [ref, { data }] of changes.items()) { + for (const [ref, { data, objStreamRef, index }] of changes.items()) { + if (objStreamRef) { + newRefs.push({ ref, data, objStreamRef, index }); + continue; + } if (data === null || typeof data === "string") { newRefs.push({ ref, data }); continue; @@ -483,4 +494,4 @@ async function incrementalUpdate({ return array; } -export { incrementalUpdate, writeChanges, writeDict, writeObject }; +export { incrementalUpdate, writeChanges, writeDict, writeObject, writeValue }; diff --git a/src/display/api.js b/src/display/api.js index 149ceb237..b279df994 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1025,6 +1025,24 @@ class PDFDocumentProxy { return this._transport.saveDocument(); } + /** + * @typedef {Object} PageInfo + * @property {null|Uint8Array} document + * @property {Array|number>} [includePages] + * included ranges or indices. + * @property {Array|number>} [excludePages] + * excluded ranges or indices. + */ + + /** + * @param {Array} pageInfos - The pages to extract. + * @returns {Promise} A promise that is resolved with a + * {Uint8Array} containing the full data of the saved document. + */ + extractPages(pageInfos) { + return this._transport.extractPages(pageInfos); + } + /** * @returns {Promise<{ length: number }>} A promise that is resolved when the * document's data is loaded. 
It is resolved with an {Object} that contains @@ -2900,6 +2918,10 @@ class WorkerTransport { }); } + extractPages(pageInfos) { + return this.messageHandler.sendWithPromise("ExtractPages", { pageInfos }); + } + getPage(pageNumber) { if ( !Number.isInteger(pageNumber) || diff --git a/test/driver.js b/test/driver.js index a21a4a610..212a0e85e 100644 --- a/test/driver.js +++ b/test/driver.js @@ -506,6 +506,7 @@ class Driver { this.inFlightRequests = 0; this.testFilter = JSON.parse(params.get("testfilter") || "[]"); this.xfaOnly = params.get("xfaonly") === "true"; + this.masterMode = params.get("mastermode") === "true"; // Create a working canvas this.canvas = document.createElement("canvas"); @@ -591,6 +592,25 @@ class Driver { task.stats = { times: [] }; task.enableXfa = task.enableXfa === true; + if (task.includePages && task.type === "extract") { + if (this.masterMode) { + const includePages = []; + for (const page of task.includePages) { + if (Array.isArray(page)) { + for (let i = page[0]; i <= page[1]; i++) { + includePages.push(i); + } + } else { + includePages.push(page); + } + } + task.numberOfTasks = includePages.length; + task.includePages = includePages; + } else { + delete task.pageMapping; + } + } + const prevFile = md5FileMap.get(task.md5); if (prevFile) { if (task.file !== prevFile) { @@ -658,6 +678,20 @@ class Driver { }); let promise = loadingTask.promise; + if (!this.masterMode && task.type === "extract") { + promise = promise.then(async doc => { + const data = await doc.extractPages([ + { + document: null, + includePages: task.includePages, + }, + ]); + await loadingTask.destroy(); + delete task.includePages; + return getDocument(data).promise; + }); + } + if (task.annotationStorage) { for (const annotation of Object.values(task.annotationStorage)) { const { bitmapName, quadPoints, paths, outlines } = annotation; @@ -862,7 +896,12 @@ class Driver { } } - if (task.skipPages?.includes(task.pageNum)) { + if ( + task.skipPages?.includes(task.pageNum) || 
+ (this.masterMode && + task.includePages && + !task.includePages.includes(task.pageNum - 1)) + ) { this._log( ` Skipping page ${task.pageNum}/${task.pdfDoc.numPages}...\n` ); @@ -1274,10 +1313,11 @@ class Driver { id: task.id, numPages: task.pdfDoc ? task.lastPage || task.pdfDoc.numPages : 0, lastPageNum: this._getLastPageNumber(task), + numberOfTasks: task.numberOfTasks ?? -1, failure, file: task.file, round: task.round, - page: task.pageNum, + page: task.pageMapping?.[task.pageNum] ?? task.pageNum, snapshot, baselineSnapshot, stats: task.stats.times, diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 1eb2dce1b..91091a44e 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -754,3 +754,6 @@ !bug1937438_from_word.pdf !bug1937438_mml_from_latex.pdf !bug1997343.pdf +!doc_1_3_pages.pdf +!doc_2_3_pages.pdf +!doc_3_3_pages.pdf diff --git a/test/pdfs/doc_1_3_pages.pdf b/test/pdfs/doc_1_3_pages.pdf new file mode 100755 index 000000000..f71ed36ab Binary files /dev/null and b/test/pdfs/doc_1_3_pages.pdf differ diff --git a/test/pdfs/doc_2_3_pages.pdf b/test/pdfs/doc_2_3_pages.pdf new file mode 100755 index 000000000..3ccb2f422 Binary files /dev/null and b/test/pdfs/doc_2_3_pages.pdf differ diff --git a/test/pdfs/doc_3_3_pages.pdf b/test/pdfs/doc_3_3_pages.pdf new file mode 100755 index 000000000..63222e899 Binary files /dev/null and b/test/pdfs/doc_3_3_pages.pdf differ diff --git a/test/test.mjs b/test/test.mjs index c6e432e1f..ee9cf7340 100644 --- a/test/test.mjs +++ b/test/test.mjs @@ -672,6 +672,7 @@ function checkRefTestResults(browser, id, results) { case "partial": case "text": case "highlight": + case "extract": checkEq(task, results, browser, session.masterMode); break; case "fbf": @@ -731,6 +732,7 @@ function refTestPostHandler(parsedUrl, req, res) { var snapshot = data.snapshot; var baselineSnapshot = data.baselineSnapshot; var lastPageNum = data.lastPageNum; + var numberOfTasks = data.numberOfTasks; session = getSession(browser); 
monitorBrowserTimeout(session, handleSessionTimeout); @@ -773,7 +775,10 @@ function refTestPostHandler(parsedUrl, req, res) { }); } - var isDone = taskResults.at(-1)?.[lastPageNum - 1]; + const lastTaskResults = taskResults.at(-1); + const isDone = + lastTaskResults?.[lastPageNum - 1] || + lastTaskResults?.filter(result => !!result).length === numberOfTasks; if (isDone) { checkRefTestResults(browser, id, taskResults); session.remaining--; diff --git a/test/test_manifest.json b/test/test_manifest.json index 03d1f1d71..f864c2be9 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -13049,5 +13049,23 @@ "rotation": 0 } } + }, + { + "id": "tracemonkey-extract_0_2_12", + "file": "pdfs/tracemonkey.pdf", + "md5": "9a192d8b1a7dc652a19835f6f08098bd", + "rounds": 1, + "type": "extract", + "includePages": [0, 2, 12], + "pageMapping": { "1": 1, "3": 2, "13": 3 } + }, + { + "id": "bug900822-encrypted-extract_0", + "file": "pdfs/bug900822.pdf", + "md5": "70e2a3c5922574eeda169c955cf9d084", + "rounds": 1, + "type": "extract", + "includePages": [0], + "pageMapping": { "1": 1 } } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index a8e0fbc07..0729875b6 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5335,4 +5335,212 @@ deployment as easy as distributing a source file. 
They are used for small scripts as well as for`); }); }); + + describe("PDF page editing", function () { + describe("Merge pdfs", function () { + it("should merge three PDFs", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("doc_1_3_pages.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfData2 = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_2_3_pages.pdf", + }); + const pdfData3 = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_3_3_pages.pdf", + }); + + let data = await pdfDoc.extractPages([ + { document: null }, + { document: pdfData2 }, + { document: pdfData3 }, + ]); + let newLoadingTask = getDocument(data); + let newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(9); + + for (let i = 1; i <= 9; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual( + `Document ${Math.ceil(i / 3)}:Page ${((i - 1) % 3) + 1}` + ); + } + await newLoadingTask.destroy(); + + data = await pdfDoc.extractPages([ + { document: pdfData3 }, + { document: pdfData2 }, + { document: null }, + ]); + newLoadingTask = getDocument(data); + newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(9); + for (let i = 1; i <= 9; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual( + `Document ${Math.ceil((10 - i) / 3)}:Page ${((i - 1) % 3) + 1}` + ); + } + await newLoadingTask.destroy(); + + data = await pdfDoc.extractPages([ + { document: null, includePages: [0] }, + { document: pdfData2, includePages: [0] }, + { document: pdfData3, includePages: [0] }, + ]); + newLoadingTask = getDocument(data); + newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(3); + for (let i = 1; i <= 3; i++) { + const pdfPage = await newPdfDoc.getPage(i); + 
const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual(`Document ${i}:Page 1`); + } + await newLoadingTask.destroy(); + + data = await pdfDoc.extractPages([ + { document: null, excludePages: [0] }, + { document: pdfData2, excludePages: [0] }, + { document: pdfData3, excludePages: [0] }, + ]); + newLoadingTask = getDocument(data); + newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(6); + for (let i = 1; i <= 6; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual( + `Document ${Math.ceil(i / 2)}:Page ${((i - 1) % 2) + 2}` + ); + } + await newLoadingTask.destroy(); + + await loadingTask.destroy(); + }); + + it("should merge two PDFs with page included ranges", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("tracemonkey.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfData1 = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_1_3_pages.pdf", + }); + + const data = await pdfDoc.extractPages([ + { document: pdfData1, includePages: [[0, 0], 2] }, + { document: null, includePages: [[2, 4], 7] }, + ]); + const newLoadingTask = getDocument(data); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(6); + + for (let i = 1; i <= 2; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`); + } + + const expectedPagesText = [ + "v0 := ld s", + "i=4. 
On th", + "resentatio", + "5.1 Optimi", + ]; + for (let i = 3; i <= 6; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + const text = mergeText(textItems); + expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]); + } + + await newLoadingTask.destroy(); + await loadingTask.destroy(); + }); + + it("should merge two PDFs with page excluded ranges", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("tracemonkey.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfData1 = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "doc_1_3_pages.pdf", + }); + + const data = await pdfDoc.extractPages([ + { document: pdfData1, excludePages: [[1, 1]] }, + { + document: null, + excludePages: [ + [0, 1], + [5, 6], + [8, 13], + ], + }, + ]); + const newLoadingTask = getDocument(data); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(6); + + for (let i = 1; i <= 2; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`); + } + + const expectedPagesText = [ + "v0 := ld s", + "i=4. 
On th", + "resentatio", + "5.1 Optimi", + ]; + for (let i = 3; i <= 6; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + const text = mergeText(textItems); + expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]); + } + + await newLoadingTask.destroy(); + await loadingTask.destroy(); + }); + + it("should merge two PDFs with one with a password", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("doc_1_3_pages.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfData1 = await DefaultFileReaderFactory.fetch({ + path: TEST_PDFS_PATH + "pr6531_2.pdf", + }); + + const data = await pdfDoc.extractPages([ + { document: null, includePages: [0] }, + { document: pdfData1, password: "asdfasdf" }, + ]); + const newLoadingTask = getDocument(data); + const newPdfDoc = await newLoadingTask.promise; + expect(newPdfDoc.numPages).toEqual(2); + + const expectedPagesText = ["Document 1:Page 1", ""]; + for (let i = 1; i <= 2; i++) { + const pdfPage = await newPdfDoc.getPage(i); + const { items: textItems } = await pdfPage.getTextContent(); + expect(mergeText(textItems)).toEqual(expectedPagesText[i - 1]); + } + + const page2 = await newPdfDoc.getPage(2); + const annots = await page2.getAnnotations(); + expect(annots.length).toEqual(1); + expect(annots[0].contentsObj.str).toEqual( + "Bluebeam should be encrypting this." 
+ ); + + await newLoadingTask.destroy(); + await loadingTask.destroy(); + }); + }); + }); }); diff --git a/test/unit/primitives_spec.js b/test/unit/primitives_spec.js index 04c92009b..b71df0b83 100644 --- a/test/unit/primitives_spec.js +++ b/test/unit/primitives_spec.js @@ -310,6 +310,16 @@ describe("primitives", function () { expect(rawValues2.sort()).toEqual(expectedRawValues2); }); + it("should get all raw entries", function () { + const expectedRawEntries = [ + ["FontFile", testFontFile], + ["FontFile2", testFontFile2], + ["FontFile3", testFontFile3], + ]; + const rawEntries = Array.from(dictWithManyKeys.getRawEntries()); + expect(rawEntries.sort()).toEqual(expectedRawEntries); + }); + it("should create only one object for Dict.empty", function () { const firstDictEmpty = Dict.empty; const secondDictEmpty = Dict.empty; @@ -423,6 +433,12 @@ describe("primitives", function () { dict.setIfName("k", 1234); expect(dict.has("k")).toBeFalse(); + + dict.setIfDict("l", new Dict()); + expect(dict.get("l")).toEqual(new Dict()); + + dict.setIfDict("m", "not a dict"); + expect(dict.has("m")).toBeFalse(); }); }); diff --git a/test/unit/writer_spec.js b/test/unit/writer_spec.js index 15866ee14..394c74429 100644 --- a/test/unit/writer_spec.js +++ b/test/unit/writer_spec.js @@ -170,8 +170,8 @@ describe("Writer", function () { const expected = "<< /A /B /B 123 456 R /C 789 /D (hello world) " + - "/E (\\(hello\\\\world\\)) /F [1.23 4.5 6] " + - "/G << /H 123 /I << /Length 8>> stream\n" + + "/E (\\(hello\\\\world\\)) /F [1.23001 4.50001 6] " + + "/G << /H 123.00001 /I << /Length 8>> stream\n" + "a stream\n" + "endstream>> /J true /K false " + "/NullArr [null 10] /NullVal null>>";