Merge pull request #20327 from edoardocavazza/collect-list-table-children

Collect all child nodes of lists and tables in StructTree
This commit is contained in:
calixteman 2025-10-29 21:09:52 +01:00 committed by GitHub
commit 7fc5706e16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 0 deletions

View File

@ -824,6 +824,23 @@ class StructTreePage {
const element = new StructElementNode(this, dict);
map.set(dict, element);
switch (element.role) {
case "L":
case "LBody":
case "LI":
case "Table":
case "THead":
case "TBody":
case "TFoot":
case "TR": {
// Always collect all child nodes of lists and tables, even empty ones
for (const kid of element.kids) {
if (kid.type === StructElementType.ELEMENT) {
this.addNode(kid.dict, map, level - 1);
}
}
}
}
const parent = dict.get("P");

View File

@ -620,6 +620,7 @@
!autoprint.pdf
!bug1811694.pdf
!bug1811510.pdf
!issue20324.pdf
!bug1815476.pdf
!issue16021.pdf
!bug1770750.pdf

BIN
test/pdfs/issue20324.pdf Normal file

Binary file not shown.

View File

@ -300,6 +300,34 @@ describe("struct tree", function () {
},
struct
);
});
it("should collect all list and table items in StructTree", async function () {
  // Walks the tree depth-first (pre-order) and gathers, in visiting order,
  // every node for which `accepts` returns a truthy value.
  const collectMatching = (root, accepts) => {
    const matches = [];
    const visit = current => {
      if (accepts(current)) {
        matches.push(current);
      }
      if (!current.children) {
        // Nodes without a `children` list have nothing to descend into.
        return;
      }
      for (const descendant of current.children) {
        visit(descendant);
      }
    };
    visit(root);
    return matches;
  };
  // Builds a predicate selecting nodes whose `role` equals the given one.
  const hasRole = role => node => node.role === role;

  const loadingTask = getDocument(buildGetDocumentParams("issue20324.pdf"));
  const doc = await loadingTask.promise;
  const firstPage = await doc.getPage(1);
  const structTree = await firstPage.getStructTree({
    includeMarkedContent: true,
  });

  // The test expects four table cells and four list items to be reported
  // for the first page of the fixture document.
  expect(collectMatching(structTree, hasRole("TD")).length).toEqual(4);
  expect(collectMatching(structTree, hasRole("LI")).length).toEqual(4);

  await loadingTask.destroy();
});
});