From 83e68b3b27422d572be17ac91f78d85e64cf23fa Mon Sep 17 00:00:00 2001 From: "Richard Smith (smir)" Date: Tue, 20 Aug 2024 09:46:49 +0100 Subject: [PATCH 1/2] Assume that a top level /Pages list contains /Page dicts directly if it is the right length, and there are enough pages to be worth the optimisation --- src/core/catalog.js | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 47d2812de..a91676d03 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -121,6 +121,8 @@ function fetchRemoteDest(action) { return null; } +const MIN_PAGES_TO_ASSUME_ALL_TOP_LEVEL = 20; + class Catalog { constructor(pdfManager, xref) { this.pdfManager = pdfManager; @@ -1186,6 +1188,14 @@ class Catalog { pageIndexCache = this.pageIndexCache; let currentPageIndex = 0; + const getIsPage = async obj => { + let type = obj.getRaw("Type"); + if (type instanceof Ref) { + type = await xref.fetchAsync(type); + } + return isName(type, "Page") || !obj.has("Kids"); + }; + while (nodesToVisit.length) { const currentNode = nodesToVisit.pop(); @@ -1202,13 +1212,13 @@ class Catalog { } visitedNodes.put(currentNode); - const obj = await xref.fetchAsync(currentNode); + const obj = + (await this.pageDictCache.get(currentNode)) || + xref.fetchAsync(currentNode); + if (obj instanceof Dict) { - let type = obj.getRaw("Type"); - if (type instanceof Ref) { - type = await xref.fetchAsync(type); - } - if (isName(type, "Page") || !obj.has("Kids")) { + const isPage = await getIsPage(obj); + if (isPage) { // Cache the Page reference, since it can *greatly* improve // performance by reducing redundant lookups in long documents // where all nodes are found at *one* level of the tree. 
@@ -1283,6 +1293,22 @@ class Catalog { // Always check all `Kids` nodes, to avoid getting stuck in an empty // node further down in the tree (see issue5644.pdf, issue8088.pdf), // and to ensure that we actually find the correct `Page` dict. + + // EXCEPT if it looks like this is likely an "everything in the top level + // /Pages" PDF with enough pages to be worth not reading everything. + // See if the n'th item is a page. + if ( + currentNode === this.toplevelPagesDict && + count >= MIN_PAGES_TO_ASSUME_ALL_TOP_LEVEL && + count === kids.length + ) { + const maybePage = await xref.fetchAsync(kids[currentPageIndex]); + const isPage = await getIsPage(maybePage); + if (isPage) { + return [maybePage, kids[currentPageIndex]]; + } + } + for (let last = kids.length - 1; last >= 0; last--) { nodesToVisit.push(kids[last]); } From 1c5a754ff0acb33258bcaf094ca90c799cb342e9 Mon Sep 17 00:00:00 2001 From: "Richard Smith (smir)" Date: Tue, 20 Aug 2024 16:41:02 +0100 Subject: [PATCH 2/2] Random access page loading - fixed which page to load --- src/core/catalog.js | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index a91676d03..1d6116ebe 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1212,9 +1212,7 @@ class Catalog { } visitedNodes.put(currentNode); - const obj = - (await this.pageDictCache.get(currentNode)) || - xref.fetchAsync(currentNode); + const obj = await xref.fetchAsync(currentNode); if (obj instanceof Dict) { const isPage = await getIsPage(obj); @@ -1302,13 +1300,13 @@ class Catalog { count >= MIN_PAGES_TO_ASSUME_ALL_TOP_LEVEL && count === kids.length ) { - const maybePage = await xref.fetchAsync(kids[currentPageIndex]); + const maybePage = await xref.fetchAsync(kids[pageIndex]); const isPage = await getIsPage(maybePage); if (isPage) { - return [maybePage, kids[currentPageIndex]]; + return [maybePage, kids[pageIndex]]; } } - + for (let last = kids.length - 1; last >= 0; last--) { 
nodesToVisit.push(kids[last]); }