Merge 9f576beee83291b61f1b54e3681671ceca256fd0 into 4aca13e77aa2f2c4b5a133aa43fe27bbdf86ad21

This commit is contained in:
Edoardo Cavazza 2025-11-29 22:28:28 +03:00 committed by GitHub
commit 21c3a13774
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 64 additions and 0 deletions

View File

@ -2317,9 +2317,19 @@ class PartialEvaluator {
return; return;
} }
// Other marked content types aren't supported yet. // Other marked content types aren't supported yet.
let props = null;
if (args[1] instanceof Dict) {
const lang = args[1].get("Lang");
if (typeof lang === "string") {
props = Object.create(null);
props.lang = stringToPDFString(lang);
}
}
args = [ args = [
args[0].name, args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null, args[1] instanceof Dict ? args[1].get("MCID") : null,
props,
]; ];
break; break;
@ -3505,8 +3515,13 @@ class PartialEvaluator {
markedContentData.level++; markedContentData.level++;
let mcid = null; let mcid = null;
let itemLang = null;
if (args[1] instanceof Dict) { if (args[1] instanceof Dict) {
mcid = args[1].get("MCID"); mcid = args[1].get("MCID");
const langString = args[1].get("Lang");
if (typeof langString === "string") {
itemLang = stringToPDFString(langString);
}
} }
textContent.items.push({ textContent.items.push({
type: "beginMarkedContentProps", type: "beginMarkedContentProps",
@ -3514,6 +3529,7 @@ class PartialEvaluator {
? `${self.idFactory.getPageObjId()}_mc${mcid}` ? `${self.idFactory.getPageObjId()}_mc${mcid}`
: null, : null,
tag: args[0] instanceof Name ? args[0].name : null, tag: args[0] instanceof Name ? args[0].name : null,
lang: itemLang,
}); });
} }
break; break;

View File

@ -1184,6 +1184,8 @@ class PDFDocumentProxy {
* 'beginMarkedContentProps', or 'endMarkedContent'. * 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type * @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'. * 'beginMarkedContentProps'.
* @property {string|null} tag - The marked content tag.
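* @property {string|null} lang - The language of the marked content, taken
*   from the `Lang` entry of its property dictionary, or `null` when that
*   entry is absent. Only used for type 'beginMarkedContentProps'.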
*/ */
/** /**

View File

@ -293,6 +293,9 @@ class TextLayer {
if (item.id) { if (item.id) {
this.#container.setAttribute("id", `${item.id}`); this.#container.setAttribute("id", `${item.id}`);
} }
if (item.lang) {
this.#container.setAttribute("lang", item.lang);
}
parent.append(this.#container); parent.append(this.#container);
} else if (item.type === "endMarkedContent") { } else if (item.type === "endMarkedContent") {
this.#container = this.#container.parentNode; this.#container = this.#container.parentNode;

View File

@ -754,6 +754,7 @@
!bug1937438_af_from_latex.pdf !bug1937438_af_from_latex.pdf
!bug1937438_from_word.pdf !bug1937438_from_word.pdf
!bug1937438_mml_from_latex.pdf !bug1937438_mml_from_latex.pdf
!marked_content_lang.pdf
!bug1997343.pdf !bug1997343.pdf
!doc_1_3_pages.pdf !doc_1_3_pages.pdf
!doc_2_3_pages.pdf !doc_2_3_pages.pdf

Binary file not shown.

View File

@ -4488,6 +4488,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy(); await loadingTask.destroy();
}); });
// Verifies that the `Lang` entry of a marked-content property dictionary is
// exposed as `lang` on the third argument of `beginMarkedContentProps`.
it("gets operatorList, with marked content lang", async function () {
  const loadingTask = getDocument(
    buildGetDocumentParams("marked_content_lang.pdf")
  );
  const pdfDoc = await loadingTask.promise;
  const pdfPage = await pdfDoc.getPage(1);
  const opList = await pdfPage.getOperatorList({
    annotationMode: AnnotationMode.DISABLE,
  });

  // First marked-content sequence of the fixture.
  expect(opList.fnArray[0]).toEqual(OPS.beginMarkedContentProps);
  expect(opList.argsArray[0][0]).toEqual("P");
  expect(opList.argsArray[0][2]?.lang).toEqual("en-US");

  // Second marked-content sequence, tagged with a different language.
  expect(opList.fnArray[10]).toEqual(OPS.beginMarkedContentProps);
  expect(opList.argsArray[10][0]).toEqual("P");
  expect(opList.argsArray[10][2]?.lang).toEqual("es-ES");

  // Release the document like the neighbouring tests do, so it does not
  // leak into later specs.
  await loadingTask.destroy();
});
it("gets operatorList, with page resources containing corrupt /CCITTFaxDecode data", async function () { it("gets operatorList, with page resources containing corrupt /CCITTFaxDecode data", async function () {
const loadingTask = getDocument( const loadingTask = getDocument(
buildGetDocumentParams("poppler-90-0-fuzzed.pdf") buildGetDocumentParams("poppler-90-0-fuzzed.pdf")

View File

@ -250,4 +250,29 @@ describe("textLayer", function () {
await loadingTask.destroy(); await loadingTask.destroy();
}); });
// Verifies that the text layer copies the marked-content `lang` value onto
// the corresponding container element as a `lang` attribute.
it("handles lang attribute for marked content", async function () {
  if (isNodeJS) {
    pending("document.createElement is not supported in Node.js.");
  }
  const loadingTask = getDocument(
    buildGetDocumentParams("marked_content_lang.pdf")
  );
  const pdfDocument = await loadingTask.promise;
  const page = await pdfDocument.getPage(1);

  const container = document.createElement("div");
  const textLayer = new TextLayer({
    // Marked content must be requested explicitly, otherwise no
    // marked-content items are emitted for the layer to render.
    textContentSource: page.streamTextContent({
      includeMarkedContent: true,
    }),
    container,
    viewport: page.getViewport({ scale: 1 }),
  });
  await textLayer.render();

  // The id has the form `<pageObjId>_mc<MCID>`.
  const span = container.querySelector("#p17R_mc1");
  expect(span.getAttribute("lang")).toEqual("es-ES");
  expect(span.textContent).toEqual("Esto es español");

  // Release the document like the neighbouring tests do, so it does not
  // leak into later specs.
  await loadingTask.destroy();
});
}); });