Extract Lang attribute for marked contents

This commit is contained in:
Edoardo Cavazza 2025-10-29 19:50:28 +01:00
parent de7179fd74
commit e4569c5d22
5 changed files with 64 additions and 0 deletions

View File

@ -2317,9 +2317,19 @@ class PartialEvaluator {
return;
}
// Other marked content types aren't supported yet.
let props = null;
if (args[1] instanceof Dict) {
const lang = args[1].get("Lang");
if (typeof lang === "string") {
props = Object.create(null);
props.lang = stringToPDFString(lang);
}
}
args = [
args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null,
props,
];
break;
@ -3505,8 +3515,13 @@ class PartialEvaluator {
markedContentData.level++;
let mcid = null;
let itemLang = null;
if (args[1] instanceof Dict) {
mcid = args[1].get("MCID");
const langString = args[1].get("Lang");
if (typeof langString === "string") {
itemLang = stringToPDFString(langString);
}
}
textContent.items.push({
type: "beginMarkedContentProps",
@ -3514,6 +3529,7 @@ class PartialEvaluator {
? `${self.idFactory.getPageObjId()}_mc${mcid}`
: null,
tag: args[0] instanceof Name ? args[0].name : null,
lang: itemLang,
});
}
break;

View File

@ -1166,6 +1166,8 @@ class PDFDocumentProxy {
* 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'.
* @property {string|null} tag - The marked content tag.
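* @property {string|null} lang - The language of the marked content, taken
* from the `Lang` entry of its property dictionary, or `null` when no string
* value is present. Only used for type 'beginMarkedContentProps'.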
*/
/**

View File

@ -293,6 +293,9 @@ class TextLayer {
if (item.id) {
this.#container.setAttribute("id", `${item.id}`);
}
if (item.lang) {
this.#container.setAttribute("lang", item.lang);
}
parent.append(this.#container);
} else if (item.type === "endMarkedContent") {
this.#container = this.#container.parentNode;

View File

@ -4488,6 +4488,27 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});
// Checks that the `lang` value of a marked-content sequence is exposed on the
// third argument of the corresponding `beginMarkedContentProps` operator.
it("gets operatorList, with marked content lang", async function () {
  const loadingTask = getDocument(
    buildGetDocumentParams("marked_content_lang.pdf")
  );
  const pdfDoc = await loadingTask.promise;
  const pdfPage = await pdfDoc.getPage(1);
  const opList = await pdfPage.getOperatorList({
    annotationMode: AnnotationMode.DISABLE,
  });

  // First marked-content sequence on the page.
  expect(opList.fnArray[0]).toEqual(OPS.beginMarkedContentProps);
  expect(opList.argsArray[0][0]).toEqual("P");
  expect(opList.argsArray[0][2]?.lang).toEqual("en-US");

  // Second marked-content sequence, tagged with a different language.
  expect(opList.fnArray[10]).toEqual(OPS.beginMarkedContentProps);
  expect(opList.argsArray[10][0]).toEqual("P");
  expect(opList.argsArray[10][2]?.lang).toEqual("es-ES");

  // Release the loaded document, as the preceding test does.
  await loadingTask.destroy();
});
it("gets operatorList, with page resources containing corrupt /CCITTFaxDecode data", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("poppler-90-0-fuzzed.pdf")

View File

@ -250,4 +250,26 @@ describe("textLayer", function () {
await loadingTask.destroy();
});
// Renders the text layer with marked content included and checks that the
// marked-content element carries the expected `lang` attribute and text.
it("handles lang attribute for marked content", async function () {
  const loadingTask = getDocument(
    buildGetDocumentParams("marked_content_lang.pdf")
  );
  const pdfDocument = await loadingTask.promise;
  const page = await pdfDocument.getPage(1);

  const container = document.createElement("div");
  const textLayer = new TextLayer({
    textContentSource: page.streamTextContent({
      includeMarkedContent: true,
    }),
    container,
    viewport: page.getViewport({ scale: 1 }),
  });
  await textLayer.render();

  const span = container.querySelector("#p17R_mc1");
  expect(span.getAttribute("lang")).toEqual("es-ES");
  expect(span.textContent).toEqual("Esto es español");

  // Release the loaded document, as the preceding test does.
  await loadingTask.destroy();
});
});