diff --git a/src/core/catalog.js b/src/core/catalog.js index 3f73e6d91..d6b12d19e 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -84,6 +84,8 @@ function fetchRemoteDest(action) { return null; } +const MIN_PAGES_TO_ASSUME_ALL_TOP_LEVEL = 20; + class Catalog { #actualNumPages = null; @@ -1185,6 +1187,14 @@ class Catalog { pageDictCache = this.pageDictCache; let currentPageIndex = 0; + const getIsPage = async obj => { + let type = obj.getRaw("Type"); + if (type instanceof Ref) { + type = await xref.fetchAsync(type); + } + return isName(type, "Page") || !obj.has("Kids"); + }; + while (nodesToVisit.length) { const currentNode = nodesToVisit.pop(); @@ -1204,11 +1214,8 @@ class Catalog { const obj = await (pageDictCache.get(currentNode) || xref.fetchAsync(currentNode)); if (obj instanceof Dict) { - let type = obj.getRaw("Type"); - if (type instanceof Ref) { - type = await xref.fetchAsync(type); - } - if (isName(type, "Page") || !obj.has("Kids")) { + const isPage = await getIsPage(obj); + if (isPage) { // Cache the Page reference, since it can *greatly* improve // performance by reducing redundant lookups in long documents // where all nodes are found at *one* level of the tree. @@ -1283,6 +1290,22 @@ class Catalog { // Always check all `Kids` nodes, to avoid getting stuck in an empty // node further down in the tree (see issue5644.pdf, issue8088.pdf), // and to ensure that we actually find the correct `Page` dict. + + // EXCEPT if it looks like this is likely an "everything in the top level + // /Pages" PDF and there's enough pages to be worth not reading everything + // See if the n'th item is a page + if ( + currentNode === this.toplevelPagesDict && + count >= MIN_PAGES_TO_ASSUME_ALL_TOP_LEVEL && + count === kids.length + ) { + const maybePage = await xref.fetchAsync(kids[pageIndex]); + const isPage = await getIsPage(maybePage); + if (isPage) { + return [maybePage, kids[pageIndex]]; + } + } + for (let last = kids.length - 1; last >= 0; last--) { const lastKid = kids[last]; nodesToVisit.push(lastKid);