Merge pull request #20409 from calixteman/split_merge_p1

Add the possibility to create a PDF from different ones (bug 1997379)
Commit 85ed401b82 by calixteman, committed via GitHub on 2025-11-07 15:05:52 +01:00.
19 changed files with 1089 additions and 44 deletions
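In short: the worker gains an "ExtractPages" command, a new core PDFEditor class clones the selected pages (together with all of their object dependencies) into a freshly written PDF, and the API exposes this as PDFDocumentProxy.extractPages. A minimal usage sketch (hedged: variable names are illustrative; the shape follows the api.js hunk and the unit tests below):

const pdfDoc = await getDocument("a.pdf").promise;
// Keep pages 1-3 of the open document, then append every page of another file.
const data = await pdfDoc.extractPages([
  { document: null, includePages: [[0, 2]] }, // null means the open document
  { document: otherFileBytes },               // a Uint8Array holding another PDF
]);
const mergedDoc = await getDocument(data).promise;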

View File

@@ -131,6 +131,19 @@ class DecodeStream extends BaseStream {
getBaseStreams() {
return this.stream ? this.stream.getBaseStreams() : null;
}
clone() {
// Make sure it has been fully read.
while (!this.eof) {
this.readBlock();
}
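// The clone is a plain Stream over the fully decoded bytes (with a cloned
// dict) rather than another DecodeStream.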
return new Stream(
this.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StreamsSequenceStream extends DecodeStream {

View File

@@ -52,6 +52,10 @@ class DecryptStream extends DecodeStream {
buffer.set(chunk, bufferLength);
this.bufferLength = newLength;
}
getOriginalStream() {
return this;
}
}
export { DecryptStream };

View File

@@ -178,7 +178,7 @@ class Page {
);
}
#getBoundingBox(name) {
getBoundingBox(name) {
if (this.xfaData) {
return this.xfaData.bbox;
}
@@ -201,7 +201,7 @@
return shadow(
this,
"mediaBox",
this.#getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
this.getBoundingBox("MediaBox") || LETTER_SIZE_MEDIABOX
);
}
@@ -210,7 +210,7 @@
return shadow(
this,
"cropBox",
this.#getBoundingBox("CropBox") || this.mediaBox
this.getBoundingBox("CropBox") || this.mediaBox
);
}

View File

@@ -0,0 +1,594 @@
/* Copyright 2025 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** @typedef {import("../document.js").PDFDocument} PDFDocument */
/** @typedef {import("../document.js").Page} Page */
/** @typedef {import("../xref.js").XRef} XRef */
import { Dict, isName, Ref, RefSetCache } from "../primitives.js";
import { getModificationDate, stringToPDFString } from "../../shared/util.js";
import { incrementalUpdate, writeValue } from "../writer.js";
import { BaseStream } from "../base_stream.js";
import { StringStream } from "../stream.js";
import { stringToAsciiOrUTF16BE } from "../core_utils.js";
const MAX_LEAVES_PER_PAGES_NODE = 16;
class PageData {
constructor(page, documentData) {
this.page = page;
this.documentData = documentData;
this.annotations = null;
documentData.pagesMap.put(page.ref, this);
}
}
class DocumentData {
constructor(document) {
this.document = document;
this.pagesMap = new RefSetCache();
this.oldRefMapping = new RefSetCache();
}
}
class PDFEditor {
constructor({ useObjectStreams = true, title = "", author = "" } = {}) {
this.hasSingleFile = false;
this.currentDocument = null;
this.oldPages = [];
this.newPages = [];
this.xref = [null];
this.newRefCount = 1;
[this.rootRef, this.rootDict] = this.newDict;
[this.infoRef, this.infoDict] = this.newDict;
[this.pagesRef, this.pagesDict] = this.newDict;
this.namesDict = null;
this.useObjectStreams = useObjectStreams;
this.objStreamRefs = useObjectStreams ? new Set() : null;
this.version = "1.7";
this.title = title;
this.author = author;
}
/**
* Get a new reference for an object in the PDF.
* @returns {Ref}
*/
get newRef() {
const ref = Ref.get(this.newRefCount++, 0);
return ref;
}
/**
* Create a new dictionary and its reference.
* @returns {[Ref, Dict]}
*/
get newDict() {
const ref = this.newRef;
const dict = (this.xref[ref.num] = new Dict());
return [ref, dict];
}
/**
* Clone an object in the PDF.
* @param {*} obj
* @param {XRef} xref
* @returns {Promise<Ref>}
*/
async #cloneObject(obj, xref) {
const ref = this.newRef;
this.xref[ref.num] = await this.#collectDependencies(obj, true, xref);
return ref;
}
/**
* Collect the dependencies of an object and create new references for each
* dependency.
* @param {*} obj
* @param {boolean} mustClone
* @param {XRef} xref
* @returns {Promise<*>}
*/
async #collectDependencies(obj, mustClone, xref) {
if (obj instanceof Ref) {
const {
currentDocument: { oldRefMapping },
} = this;
let newRef = oldRefMapping.get(obj);
if (newRef) {
return newRef;
}
newRef = this.newRef;
oldRefMapping.put(obj, newRef);
obj = await xref.fetchAsync(obj);
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (
obj instanceof Dict &&
isName(obj.get("Type"), "Page") &&
!this.currentDocument.pagesMap.has(obj)
) {
throw new Error(
"Add a deleted page to the document is not supported."
);
}
}
this.xref[newRef.num] = await this.#collectDependencies(obj, true, xref);
return newRef;
}
const promises = [];
if (Array.isArray(obj)) {
if (mustClone) {
obj = obj.slice();
}
for (let i = 0, ii = obj.length; i < ii; i++) {
promises.push(
this.#collectDependencies(obj[i], true, xref).then(
newObj => (obj[i] = newObj)
)
);
}
await Promise.all(promises);
return obj;
}
let dict;
if (obj instanceof BaseStream) {
({ dict } = obj = obj.getOriginalStream().clone());
} else if (obj instanceof Dict) {
if (mustClone) {
obj = obj.clone();
}
dict = obj;
}
if (dict) {
for (const [key, rawObj] of dict.getRawEntries()) {
promises.push(
this.#collectDependencies(rawObj, true, xref).then(newObj =>
dict.set(key, newObj)
)
);
}
await Promise.all(promises);
}
return obj;
}
/**
* @typedef {Object} PageInfo
* @property {PDFDocument} document
* @property {Array<Array<number>|number>} [includePages]
* included ranges (inclusive) or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges (inclusive) or indices.
*/
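// Note: page indices are 0-based and ranges are inclusive. For a 6-page
// document, includePages: [[0, 2], 5] keeps pages 1-3 and 6, while
// excludePages: [0] keeps everything but the first page; a page is kept when
// it is not excluded and either matches the include filter or no include
// filter is given.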
/**
* Extract pages from the given documents.
* @param {Array<PageInfo>} pageInfos
* @returns {Promise<Uint8Array>} The serialized new PDF document.
*/
async extractPages(pageInfos) {
const promises = [];
let newIndex = 0;
this.hasSingleFile = pageInfos.length === 1;
for (const { document, includePages, excludePages } of pageInfos) {
if (!document) {
continue;
}
const documentData = new DocumentData(document);
promises.push(this.#collectDocumentData(documentData));
let keptIndices, keptRanges, deletedIndices, deletedRanges;
for (const page of includePages || []) {
if (Array.isArray(page)) {
(keptRanges ||= []).push(page);
} else {
(keptIndices ||= new Set()).add(page);
}
}
for (const page of excludePages || []) {
if (Array.isArray(page)) {
(deletedRanges ||= []).push(page);
} else {
(deletedIndices ||= new Set()).add(page);
}
}
for (let i = 0, ii = document.numPages; i < ii; i++) {
if (deletedIndices?.has(i)) {
continue;
}
if (deletedRanges) {
let isDeleted = false;
for (const [start, end] of deletedRanges) {
if (i >= start && i <= end) {
isDeleted = true;
break;
}
}
if (isDeleted) {
continue;
}
}
let takePage = false;
if (keptIndices) {
takePage = keptIndices.has(i);
}
if (!takePage && keptRanges) {
for (const [start, end] of keptRanges) {
if (i >= start && i <= end) {
takePage = true;
break;
}
}
}
if (!takePage && !keptIndices && !keptRanges) {
takePage = true;
}
if (!takePage) {
continue;
}
const newPageIndex = newIndex++;
promises.push(
document.getPage(i).then(page => {
this.oldPages[newPageIndex] = new PageData(page, documentData);
})
);
}
}
await Promise.all(promises);
promises.length = 0;
for (const page of this.oldPages) {
promises.push(this.#postCollectPageData(page));
}
await Promise.all(promises);
for (let i = 0, ii = this.oldPages.length; i < ii; i++) {
this.newPages[i] = await this.#makePageCopy(i);
}
return this.writePDF();
}
/**
* Collect the document data.
* @param {DocumentData} documentData
* @returns {Promise<void>}
*/
async #collectDocumentData(documentData) {}
/**
* Post process the collected page data.
* @param {PageData} pageData
* @returns {Promise<void>}
*/
async #postCollectPageData(pageData) {
const {
page: { xref, annotations },
} = pageData;
if (!annotations) {
return;
}
const promises = [];
let newAnnotations = [];
let newIndex = 0;
// TODO: remove only links to deleted pages.
for (const annotationRef of annotations) {
const newAnnotationIndex = newIndex++;
promises.push(
xref.fetchIfRefAsync(annotationRef).then(async annotationDict => {
if (!isName(annotationDict.get("Subtype"), "Link")) {
newAnnotations[newAnnotationIndex] = annotationRef;
}
})
);
}
await Promise.all(promises);
newAnnotations = newAnnotations.filter(annot => !!annot);
pageData.annotations = newAnnotations.length > 0 ? newAnnotations : null;
}
/**
* Create a copy of a page.
* @param {number} pageIndex
* @returns {Promise<Ref>} the page reference in the new PDF document.
*/
async #makePageCopy(pageIndex) {
const { page, documentData, annotations } = this.oldPages[pageIndex];
this.currentDocument = documentData;
const { oldRefMapping } = documentData;
const { xref, rotate, mediaBox, resources, ref: oldPageRef } = page;
const pageRef = this.newRef;
const pageDict = (this.xref[pageRef.num] = page.pageDict.clone());
oldRefMapping.put(oldPageRef, pageRef);
// No need to keep these entries as we'll set them again later.
for (const key of [
"Rotate",
"MediaBox",
"CropBox",
"BleedBox",
"TrimBox",
"ArtBox",
"Resources",
"Annots",
"Parent",
"UserUnit",
]) {
pageDict.delete(key);
}
const lastRef = this.newRefCount;
await this.#collectDependencies(pageDict, false, xref);
pageDict.set("Rotate", rotate);
pageDict.set("MediaBox", mediaBox);
for (const boxName of ["CropBox", "BleedBox", "TrimBox", "ArtBox"]) {
const box = page.getBoundingBox(boxName);
if (box?.some((value, index) => value !== mediaBox[index])) {
// These boxes are optional and their default value is the MediaBox.
pageDict.set(boxName, box);
}
}
const userUnit = page.userUnit;
if (userUnit !== 1) {
pageDict.set("UserUnit", userUnit);
}
pageDict.setIfDict(
"Resources",
await this.#collectDependencies(resources, true, xref)
);
pageDict.setIfArray(
"Annots",
await this.#collectDependencies(annotations, true, xref)
);
if (this.useObjectStreams) {
const newLastRef = this.newRefCount;
const pageObjectRefs = [];
for (let i = lastRef; i < newLastRef; i++) {
const obj = this.xref[i];
if (obj instanceof BaseStream) {
continue;
}
pageObjectRefs.push(Ref.get(i, 0));
}
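// A single object stream holds at most 0xffff objects; any overflow spills
// into additional object streams.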
for (let i = 0; i < pageObjectRefs.length; i += 0xffff) {
const objStreamRef = this.newRef;
this.objStreamRefs.add(objStreamRef.num);
this.xref[objStreamRef.num] = pageObjectRefs.slice(i, i + 0xffff);
}
}
this.currentDocument = null;
return pageRef;
}
/**
* Create the page tree structure.
*/
#makePageTree() {
const { newPages: pages, rootDict, pagesRef, pagesDict } = this;
rootDict.set("Pages", pagesRef);
pagesDict.setIfName("Type", "Pages");
pagesDict.set("Count", pages.length);
const maxLeaves =
MAX_LEAVES_PER_PAGES_NODE <= 1 ? pages.length : MAX_LEAVES_PER_PAGES_NODE;
const stack = [{ dict: pagesDict, kids: pages, parentRef: pagesRef }];
while (stack.length > 0) {
const { dict, kids, parentRef } = stack.pop();
if (kids.length <= maxLeaves) {
dict.set("Kids", kids);
for (const ref of kids) {
this.xref[ref.num].set("Parent", parentRef);
}
continue;
}
const chunkSize = Math.max(maxLeaves, Math.ceil(kids.length / maxLeaves));
const kidsChunks = [];
for (let i = 0; i < kids.length; i += chunkSize) {
kidsChunks.push(kids.slice(i, i + chunkSize));
}
const kidsRefs = [];
dict.set("Kids", kidsRefs);
for (const chunk of kidsChunks) {
const [kidRef, kidDict] = this.newDict;
kidsRefs.push(kidRef);
kidDict.setIfName("Type", "Pages");
kidDict.set("Parent", parentRef);
kidDict.set("Count", chunk.length);
stack.push({ dict: kidDict, kids: chunk, parentRef: kidRef });
}
}
}
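// Illustration (hedged): with 100 pages and MAX_LEAVES_PER_PAGES_NODE = 16,
// chunkSize = Math.max(16, Math.ceil(100 / 16)) = 16, so the root /Pages
// node gets 7 intermediate kids, each holding at most 16 page leaves.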
/**
* Create the root dictionary.
* @returns {Promise<void>}
*/
async #makeRoot() {
const { rootDict } = this;
rootDict.setIfName("Type", "Catalog");
rootDict.set("Version", this.version);
this.#makePageTree();
}
/**
* Create the info dictionary.
* @returns {Map} infoMap
*/
#makeInfo() {
const infoMap = new Map();
if (this.hasSingleFile) {
const {
xref: { trailer },
} = this.oldPages[0].documentData.document;
const oldInfoDict = trailer.get("Info");
for (const [key, value] of oldInfoDict || []) {
if (typeof value === "string") {
infoMap.set(key, stringToPDFString(value));
}
}
}
infoMap.delete("ModDate");
infoMap.set("CreationDate", getModificationDate());
infoMap.set("Creator", "PDF.js");
infoMap.set("Producer", "Firefox");
if (this.author) {
infoMap.set("Author", this.author);
}
if (this.title) {
infoMap.set("Title", this.title);
}
for (const [key, value] of infoMap) {
this.infoDict.set(key, stringToAsciiOrUTF16BE(value));
}
return infoMap;
}
/**
* Create the encryption dictionary if required.
* @returns {Promise<[Ref|null, CipherTransformFactory|null, Array|null]>}
*/
async #makeEncrypt() {
if (!this.hasSingleFile) {
return [null, null, null];
}
const { documentData } = this.oldPages[0];
const {
document: {
xref: { trailer, encrypt },
},
} = documentData;
if (!trailer.has("Encrypt")) {
return [null, null, null];
}
const encryptDict = trailer.get("Encrypt");
if (!(encryptDict instanceof Dict)) {
return [null, null, null];
}
this.currentDocument = documentData;
const result = [
await this.#cloneObject(encryptDict, trailer.xref),
encrypt,
trailer.get("ID"),
];
this.currentDocument = null;
return result;
}
/**
* Create the changes required to write the new PDF document.
* @returns {Promise<[RefSetCache, Ref]>}
*/
async #createChanges() {
const changes = new RefSetCache();
changes.put(Ref.get(0, 0xffff), { data: null });
for (let i = 1, ii = this.xref.length; i < ii; i++) {
if (this.objStreamRefs?.has(i)) {
await this.#createObjectStream(Ref.get(i, 0), this.xref[i], changes);
} else {
changes.put(Ref.get(i, 0), { data: this.xref[i] });
}
}
return [changes, this.newRef];
}
/**
* Create an object stream containing the given objects.
* @param {Ref} objStreamRef
* @param {Array<Ref>} objRefs
* @param {RefSetCache} changes
*/
async #createObjectStream(objStreamRef, objRefs, changes) {
const streamBuffer = [""];
const objOffsets = [];
let offset = 0;
const buffer = [];
for (let i = 0, ii = objRefs.length; i < ii; i++) {
const objRef = objRefs[i];
changes.put(objRef, { data: null, objStreamRef, index: i });
objOffsets.push(`${objRef.num} ${offset}`);
const data = this.xref[objRef.num];
await writeValue(data, buffer, /* transform = */ null);
const obj = buffer.join("");
buffer.length = 0;
streamBuffer.push(obj);
offset += obj.length + 1;
}
streamBuffer[0] = objOffsets.join("\n");
const objStream = new StringStream(streamBuffer.join("\n"));
const objStreamDict = (objStream.dict = new Dict());
objStreamDict.setIfName("Type", "ObjStm");
objStreamDict.set("N", objRefs.length);
objStreamDict.set("First", streamBuffer[0].length + 1);
changes.put(objStreamRef, { data: objStream });
}
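// Illustration (hedged) of the resulting object-stream layout for two small
// objects, 12 0 R serialized as "<< /A 1>>" and 15 0 R as "<< /B 2>>":
//   << /Type /ObjStm /N 2 /First 11 >>
//   stream
//   12 0
//   15 10
//   << /A 1>>
//   << /B 2>>
//   endstream
// Each header pair is an object number and its byte offset from /First.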
/**
* Write the new PDF document to a Uint8Array.
* @returns {Promise<Uint8Array>}
*/
async writePDF() {
await this.#makeRoot();
const infoMap = this.#makeInfo();
const [encryptRef, encrypt, fileIds] = await this.#makeEncrypt();
const [changes, xrefTableRef] = await this.#createChanges();
// Create the PDF header in order to help sniffers.
// PDF version must be in the range 1.0 to 1.7 inclusive.
// We add a binary comment line to ensure that the file is treated
// as a binary file by applications that open it.
const header = [
...`%PDF-${this.version}\n%`.split("").map(c => c.charCodeAt(0)),
0xfa,
0xde,
0xfa,
0xce,
];
return incrementalUpdate({
originalData: new Uint8Array(header),
changes,
xrefInfo: {
startXRef: null,
rootRef: this.rootRef,
infoRef: this.infoRef,
encryptRef,
newRef: xrefTableRef,
fileIds: fileIds || [null, null],
infoMap,
},
useXrefStream: this.useObjectStreams,
xref: {
encrypt,
encryptRef,
},
});
}
}
export { PDFEditor };
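For reference, a hedged sketch of driving PDFEditor directly, mirroring what the worker hunk below does; docA and docB stand for already-parsed core PDFDocument instances:

const editor = new PDFEditor({ title: "Merged", author: "me" });
const bytes = await editor.extractPages([
  { document: docA, includePages: [[0, 2]] }, // pages 1-3 of docA
  { document: docB, excludePages: [0] },      // all of docB except page 1
]);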

View File

@@ -188,6 +188,10 @@ class Dict {
return [...this._map.values()];
}
getRawEntries() {
return this._map.entries();
}
set(key, value) {
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
if (typeof key !== "string") {
@@ -231,6 +235,12 @@ class Dict {
}
}
setIfDict(key, value) {
if (value instanceof Dict) {
this.set(key, value);
}
}
has(key) {
return this._map.has(key);
}
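A small sketch (hedged) of the two new Dict helpers the editor relies on; the dictionary contents are illustrative:

const d = new Dict(null);
d.set("A", Ref.get(12, 0));
// getRawEntries() iterates [key, value] pairs without resolving references,
// so a Ref stays a Ref instead of being fetched through the XRef.
for (const [key, raw] of d.getRawEntries()) {
  console.log(key, raw instanceof Ref); // "A" true
}
// setIfDict() stores the value only when it really is a Dict.
d.setIfDict("Resources", "not a dict"); // no-op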

View File

@@ -82,6 +82,15 @@ class Stream extends BaseStream {
makeSubStream(start, length, dict = null) {
return new Stream(this.bytes.buffer, start, length, dict);
}
clone() {
return new Stream(
this.bytes.buffer,
this.start,
this.end - this.start,
this.dict.clone()
);
}
}
class StringStream extends Stream {

View File

@@ -36,6 +36,7 @@ import { MessageHandler, wrapReason } from "../shared/message_handler.js";
import { AnnotationFactory } from "./annotation.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { incrementalUpdate } from "./writer.js";
import { PDFEditor } from "./editor/pdf_editor.js";
import { PDFWorkerStream } from "./worker_stream.js";
import { StructTreeRoot } from "./struct_tree.js";
@@ -557,6 +558,97 @@ class WorkerMessageHandler {
return pdfManager.ensureDoc("calculationOrderIds");
});
handler.on("ExtractPages", async function ({ pageInfos }) {
if (!pageInfos) {
warn("extractPages: nothing to extract.");
return null;
}
if (!Array.isArray(pageInfos)) {
pageInfos = [pageInfos];
}
let newDocumentId = 0;
for (const pageInfo of pageInfos) {
if (pageInfo.document === null) {
pageInfo.document = pdfManager.pdfDocument;
} else if (ArrayBuffer.isView(pageInfo.document)) {
const manager = new LocalPdfManager({
source: pageInfo.document,
docId: `${docId}_extractPages_${newDocumentId++}`,
handler,
password: pageInfo.password ?? null,
evaluatorOptions: Object.assign({}, pdfManager.evaluatorOptions),
});
let recoveryMode = false;
let isValid = true;
while (true) {
try {
await manager.requestLoadedStream();
await manager.ensureDoc("checkHeader");
await manager.ensureDoc("parseStartXRef");
await manager.ensureDoc("parse", [recoveryMode]);
break;
} catch (e) {
if (e instanceof XRefParseException) {
if (recoveryMode === false) {
recoveryMode = true;
continue;
} else {
isValid = false;
warn("extractPages: XRefParseException.");
}
} else if (e instanceof PasswordException) {
const task = new WorkerTask(
`PasswordException: response ${e.code}`
);
startWorkerTask(task);
try {
const { password } = await handler.sendWithPromise(
"PasswordRequest",
e
);
manager.updatePassword(password);
} catch {
isValid = false;
warn("extractPages: invalid password.");
} finally {
finishWorkerTask(task);
}
} else {
isValid = false;
warn("extractPages: invalid document.");
}
if (!isValid) {
break;
}
}
}
if (!isValid) {
pageInfo.document = null;
continue;
}
const isPureXfa = await manager.ensureDoc("isPureXfa");
if (isPureXfa) {
pageInfo.document = null;
warn("extractPages does not support pure XFA documents.");
} else {
pageInfo.document = manager.pdfDocument;
}
} else {
warn("extractPages: invalid document.");
}
}
try {
const pdfEditor = new PDFEditor();
const buffer = await pdfEditor.extractPages(pageInfos);
return buffer;
} catch (reason) {
// eslint-disable-next-line no-console
console.error(reason);
return null;
}
});
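// Note (hedged): an encrypted document being merged in can also be unlocked
// up front by passing `password` in its PageInfo entry (as the unit tests
// do), which avoids the PasswordRequest round trip above.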
handler.on(
"SaveDocument",
async function ({ isPureXfa, numPages, annotationStorage, filename }) {

View File

@@ -19,7 +19,6 @@ import {
escapePDFName,
escapeString,
getSizeInBytes,
numberToString,
parseXFAPath,
} from "./core_utils.js";
import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
@@ -27,29 +26,34 @@ import { Stream, StringStream } from "./stream.js";
import { BaseStream } from "./base_stream.js";
import { calculateMD5 } from "./calculate_md5.js";
async function writeObject(ref, obj, buffer, { encrypt = null }) {
const transform = encrypt?.createCipherTransform(ref.num, ref.gen);
async function writeObject(
ref,
obj,
buffer,
{ encrypt = null, encryptRef = null }
) {
// Avoid encrypting the encrypt dictionary.
const transform =
encrypt && encryptRef !== ref
? encrypt.createCipherTransform(ref.num, ref.gen)
: null;
buffer.push(`${ref.num} ${ref.gen} obj\n`);
if (obj instanceof Dict) {
await writeDict(obj, buffer, transform);
} else if (obj instanceof BaseStream) {
await writeStream(obj, buffer, transform);
} else if (Array.isArray(obj) || ArrayBuffer.isView(obj)) {
await writeArray(obj, buffer, transform);
}
await writeValue(obj, buffer, transform);
buffer.push("\nendobj\n");
}
async function writeDict(dict, buffer, transform) {
buffer.push("<<");
for (const key of dict.getKeys()) {
for (const [key, rawObj] of dict.getRawEntries()) {
buffer.push(` /${escapePDFName(key)} `);
await writeValue(dict.getRaw(key), buffer, transform);
await writeValue(rawObj, buffer, transform);
}
buffer.push(">>");
}
async function writeStream(stream, buffer, transform) {
stream = stream.getOriginalStream();
stream.reset();
let bytes = stream.getBytes();
const { dict } = stream;
@@ -67,7 +71,7 @@ async function writeStream(stream, buffer, transform) {
// The number 256 is arbitrary, but it should be reasonable.
const MIN_LENGTH_FOR_COMPRESSING = 256;
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING || isFilterZeroFlateDecode) {
if (bytes.length >= MIN_LENGTH_FOR_COMPRESSING && !isFilterZeroFlateDecode) {
try {
const cs = new CompressionStream("deflate");
const writer = cs.writable.getWriter();
@@ -120,14 +124,11 @@ async function writeStream(stream, buffer, transform) {
async function writeArray(array, buffer, transform) {
buffer.push("[");
let first = true;
for (const val of array) {
if (!first) {
for (let i = 0, ii = array.length; i < ii; i++) {
await writeValue(array[i], buffer, transform);
if (i < ii - 1) {
buffer.push(" ");
} else {
first = false;
}
await writeValue(val, buffer, transform);
}
buffer.push("]");
}
@@ -145,7 +146,11 @@ async function writeValue(value, buffer, transform) {
}
buffer.push(`(${escapeString(value)})`);
} else if (typeof value === "number") {
buffer.push(numberToString(value));
// Don't round numbers in general: doing so could produce degenerate
// matrices (e.g. [0.000008 0 0 0.000008 0 0]).
// Numbers must only be "rounded" when pdf.js itself produces them and the
// current transformation matrix is well known.
buffer.push(value.toString());
} else if (typeof value === "boolean") {
buffer.push(value.toString());
} else if (value instanceof Dict) {
@@ -306,7 +311,7 @@ async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) {
}
computeIDs(baseOffset, xrefInfo, newXref);
buffer.push("trailer\n");
await writeDict(newXref, buffer);
await writeDict(newXref, buffer, null);
buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n");
}
@@ -332,10 +337,17 @@ async function getXRefStreamTable(
const xrefTableData = [];
let maxOffset = 0;
let maxGen = 0;
for (const { ref, data } of newRefs) {
for (const { ref, data, objStreamRef, index } of newRefs) {
let gen;
maxOffset = Math.max(maxOffset, baseOffset);
if (data !== null) {
// The first number in each entry is the type (see 7.5.8.3):
// 0: free object
// 1: in-use object
// 2: compressed object
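// For example, an object written at byte offset 4660 yields [1, 4660, gen],
// while entry 3 of object stream 42 yields [2, 42, 3].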
if (objStreamRef) {
gen = index;
xrefTableData.push([2, objStreamRef.num, gen]);
} else if (data !== null) {
gen = Math.min(ref.gen, 0xffff);
xrefTableData.push([1, baseOffset, gen]);
baseOffset += data.length;
@@ -371,13 +383,13 @@
function computeIDs(baseOffset, xrefInfo, newXref) {
if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
const md5 = computeMD5(baseOffset, xrefInfo);
newXref.set("ID", [xrefInfo.fileIds[0], md5]);
newXref.set("ID", [xrefInfo.fileIds[0] || md5, md5]);
}
}
function getTrailerDict(xrefInfo, changes, useXrefStream) {
const newXref = new Dict(null);
newXref.set("Prev", xrefInfo.startXRef);
newXref.setIfDefined("Prev", xrefInfo?.startXRef);
const refForXrefTable = xrefInfo.newRef;
if (useXrefStream) {
changes.put(refForXrefTable, { data: "" });
@@ -386,21 +398,20 @@ function getTrailerDict(xrefInfo, changes, useXrefStream) {
} else {
newXref.set("Size", refForXrefTable.num);
}
if (xrefInfo.rootRef !== null) {
newXref.set("Root", xrefInfo.rootRef);
}
if (xrefInfo.infoRef !== null) {
newXref.set("Info", xrefInfo.infoRef);
}
if (xrefInfo.encryptRef !== null) {
newXref.set("Encrypt", xrefInfo.encryptRef);
}
newXref.setIfDefined("Root", xrefInfo?.rootRef);
newXref.setIfDefined("Info", xrefInfo?.infoRef);
newXref.setIfDefined("Encrypt", xrefInfo?.encryptRef);
return newXref;
}
async function writeChanges(changes, xref, buffer = []) {
const newRefs = [];
for (const [ref, { data }] of changes.items()) {
for (const [ref, { data, objStreamRef, index }] of changes.items()) {
if (objStreamRef) {
newRefs.push({ ref, data, objStreamRef, index });
continue;
}
if (data === null || typeof data === "string") {
newRefs.push({ ref, data });
continue;
@@ -483,4 +494,4 @@ async function incrementalUpdate({
return array;
}
export { incrementalUpdate, writeChanges, writeDict, writeObject };
export { incrementalUpdate, writeChanges, writeDict, writeObject, writeValue };

View File

@@ -1025,6 +1025,24 @@ class PDFDocumentProxy {
return this._transport.saveDocument();
}
/**
* @typedef {Object} PageInfo
* @property {null|Uint8Array} document - The raw bytes of a document to take
* pages from, or null for the currently loaded document.
* @property {Array<Array<number>|number>} [includePages]
* included ranges (inclusive) or indices.
* @property {Array<Array<number>|number>} [excludePages]
* excluded ranges (inclusive) or indices.
*/
/**
* @param {Array<PageInfo>} pageInfos - The pages to extract.
* @returns {Promise<Uint8Array>} A promise that is resolved with a
* {Uint8Array} containing the full data of the saved document.
*/
extractPages(pageInfos) {
return this._transport.extractPages(pageInfos);
}
/**
* @returns {Promise<{ length: number }>} A promise that is resolved when the
* document's data is loaded. It is resolved with an {Object} that contains
@@ -2900,6 +2918,10 @@ class WorkerTransport {
});
}
extractPages(pageInfos) {
return this.messageHandler.sendWithPromise("ExtractPages", { pageInfos });
}
getPage(pageNumber) {
if (
!Number.isInteger(pageNumber) ||

View File

@@ -506,6 +506,7 @@ class Driver {
this.inFlightRequests = 0;
this.testFilter = JSON.parse(params.get("testfilter") || "[]");
this.xfaOnly = params.get("xfaonly") === "true";
this.masterMode = params.get("mastermode") === "true";
// Create a working canvas
this.canvas = document.createElement("canvas");
@@ -591,6 +592,25 @@ class Driver {
task.stats = { times: [] };
task.enableXfa = task.enableXfa === true;
if (task.includePages && task.type === "extract") {
if (this.masterMode) {
const includePages = [];
for (const page of task.includePages) {
if (Array.isArray(page)) {
for (let i = page[0]; i <= page[1]; i++) {
includePages.push(i);
}
} else {
includePages.push(page);
}
}
task.numberOfTasks = includePages.length;
task.includePages = includePages;
} else {
delete task.pageMapping;
}
}
const prevFile = md5FileMap.get(task.md5);
if (prevFile) {
if (task.file !== prevFile) {
@@ -658,6 +678,20 @@ class Driver {
});
let promise = loadingTask.promise;
if (!this.masterMode && task.type === "extract") {
promise = promise.then(async doc => {
const data = await doc.extractPages([
{
document: null,
includePages: task.includePages,
},
]);
await loadingTask.destroy();
delete task.includePages;
return getDocument(data).promise;
});
}
if (task.annotationStorage) {
for (const annotation of Object.values(task.annotationStorage)) {
const { bitmapName, quadPoints, paths, outlines } = annotation;
@@ -862,7 +896,12 @@ class Driver {
}
}
if (task.skipPages?.includes(task.pageNum)) {
if (
task.skipPages?.includes(task.pageNum) ||
(this.masterMode &&
task.includePages &&
!task.includePages.includes(task.pageNum - 1))
) {
this._log(
` Skipping page ${task.pageNum}/${task.pdfDoc.numPages}...\n`
);
@@ -1274,10 +1313,11 @@ class Driver {
id: task.id,
numPages: task.pdfDoc ? task.lastPage || task.pdfDoc.numPages : 0,
lastPageNum: this._getLastPageNumber(task),
numberOfTasks: task.numberOfTasks ?? -1,
failure,
file: task.file,
round: task.round,
page: task.pageNum,
page: task.pageMapping?.[task.pageNum] ?? task.pageNum,
snapshot,
baselineSnapshot,
stats: task.stats.times,

View File

@@ -754,3 +754,6 @@
!bug1937438_from_word.pdf
!bug1937438_mml_from_latex.pdf
!bug1997343.pdf
!doc_1_3_pages.pdf
!doc_2_3_pages.pdf
!doc_3_3_pages.pdf

BIN
test/pdfs/doc_1_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_2_3_pages.pdf Executable file

Binary file not shown.

BIN
test/pdfs/doc_3_3_pages.pdf Executable file

Binary file not shown.

View File

@@ -672,6 +672,7 @@ function checkRefTestResults(browser, id, results) {
case "partial":
case "text":
case "highlight":
case "extract":
checkEq(task, results, browser, session.masterMode);
break;
case "fbf":
@@ -731,6 +732,7 @@ function refTestPostHandler(parsedUrl, req, res) {
var snapshot = data.snapshot;
var baselineSnapshot = data.baselineSnapshot;
var lastPageNum = data.lastPageNum;
var numberOfTasks = data.numberOfTasks;
session = getSession(browser);
monitorBrowserTimeout(session, handleSessionTimeout);
@@ -773,7 +775,10 @@
});
}
var isDone = taskResults.at(-1)?.[lastPageNum - 1];
const lastTaskResults = taskResults.at(-1);
const isDone =
lastTaskResults?.[lastPageNum - 1] ||
lastTaskResults?.filter(result => !!result).length === numberOfTasks;
if (isDone) {
checkRefTestResults(browser, id, taskResults);
session.remaining--;

View File

@@ -13049,5 +13049,23 @@
"rotation": 0
}
}
},
{
"id": "tracemonkey-extract_0_2_12",
"file": "pdfs/tracemonkey.pdf",
"md5": "9a192d8b1a7dc652a19835f6f08098bd",
"rounds": 1,
"type": "extract",
"includePages": [0, 2, 12],
"pageMapping": { "1": 1, "3": 2, "13": 3 }
},
{
"id": "bug900822-encrypted-extract_0",
"file": "pdfs/bug900822.pdf",
"md5": "70e2a3c5922574eeda169c955cf9d084",
"rounds": 1,
"type": "extract",
"includePages": [0],
"pageMapping": { "1": 1 }
}
]

View File

@@ -5335,4 +5335,212 @@ deployment as easy as distributing a source file. They are used for
small scripts as well as for`);
});
});
describe("PDF page editing", function () {
describe("Merge pdfs", function () {
it("should merge three PDFs", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData2 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_2_3_pages.pdf",
});
const pdfData3 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_3_3_pages.pdf",
});
let data = await pdfDoc.extractPages([
{ document: null },
{ document: pdfData2 },
{ document: pdfData3 },
]);
let newLoadingTask = getDocument(data);
let newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: pdfData3 },
{ document: pdfData2 },
{ document: null },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(9);
for (let i = 1; i <= 9; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil((10 - i) / 3)}:Page ${((i - 1) % 3) + 1}`
);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData2, includePages: [0] },
{ document: pdfData3, includePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(3);
for (let i = 1; i <= 3; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document ${i}:Page 1`);
}
await newLoadingTask.destroy();
data = await pdfDoc.extractPages([
{ document: null, excludePages: [0] },
{ document: pdfData2, excludePages: [0] },
{ document: pdfData3, excludePages: [0] },
]);
newLoadingTask = getDocument(data);
newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(
`Document ${Math.ceil(i / 2)}:Page ${((i - 1) % 2) + 2}`
);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page included ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, includePages: [[0, 0], 2] },
{ document: null, includePages: [[2, 4], 7] },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with page excluded ranges", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("tracemonkey.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "doc_1_3_pages.pdf",
});
const data = await pdfDoc.extractPages([
{ document: pdfData1, excludePages: [[1, 1]] },
{
document: null,
excludePages: [
[0, 1],
[5, 6],
[8, 13],
],
},
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(6);
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(`Document 1:Page ${2 * i - 1}`);
}
const expectedPagesText = [
"v0 := ld s",
"i=4. On th",
"resentatio",
"5.1 Optimi",
];
for (let i = 3; i <= 6; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
const text = mergeText(textItems);
expect(text.substring(0, 10)).toEqual(expectedPagesText[i - 3]);
}
await newLoadingTask.destroy();
await loadingTask.destroy();
});
it("should merge two PDFs with one with a password", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("doc_1_3_pages.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfData1 = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + "pr6531_2.pdf",
});
const data = await pdfDoc.extractPages([
{ document: null, includePages: [0] },
{ document: pdfData1, password: "asdfasdf" },
]);
const newLoadingTask = getDocument(data);
const newPdfDoc = await newLoadingTask.promise;
expect(newPdfDoc.numPages).toEqual(2);
const expectedPagesText = ["Document 1:Page 1", ""];
for (let i = 1; i <= 2; i++) {
const pdfPage = await newPdfDoc.getPage(i);
const { items: textItems } = await pdfPage.getTextContent();
expect(mergeText(textItems)).toEqual(expectedPagesText[i - 1]);
}
const page2 = await newPdfDoc.getPage(2);
const annots = await page2.getAnnotations();
expect(annots.length).toEqual(1);
expect(annots[0].contentsObj.str).toEqual(
"Bluebeam should be encrypting this."
);
await newLoadingTask.destroy();
await loadingTask.destroy();
});
});
});
});

View File

@@ -310,6 +310,16 @@ describe("primitives", function () {
expect(rawValues2.sort()).toEqual(expectedRawValues2);
});
it("should get all raw entries", function () {
const expectedRawEntries = [
["FontFile", testFontFile],
["FontFile2", testFontFile2],
["FontFile3", testFontFile3],
];
const rawEntries = Array.from(dictWithManyKeys.getRawEntries());
expect(rawEntries.sort()).toEqual(expectedRawEntries);
});
it("should create only one object for Dict.empty", function () {
const firstDictEmpty = Dict.empty;
const secondDictEmpty = Dict.empty;
@@ -423,6 +433,12 @@ describe("primitives", function () {
dict.setIfName("k", 1234);
expect(dict.has("k")).toBeFalse();
dict.setIfDict("l", new Dict());
expect(dict.get("l")).toEqual(new Dict());
dict.setIfDict("m", "not a dict");
expect(dict.has("m")).toBeFalse();
});
});

View File

@@ -170,8 +170,8 @@ describe("Writer", function () {
const expected =
"<< /A /B /B 123 456 R /C 789 /D (hello world) " +
"/E (\\(hello\\\\world\\)) /F [1.23 4.5 6] " +
"/G << /H 123 /I << /Length 8>> stream\n" +
"/E (\\(hello\\\\world\\)) /F [1.23001 4.50001 6] " +
"/G << /H 123.00001 /I << /Length 8>> stream\n" +
"a stream\n" +
"endstream>> /J true /K false " +
"/NullArr [null 10] /NullVal null>>";