# Patch summary (commentary only; everything from the first "diff --git" on is unchanged).
#
# src/core/struct_tree.js
#   - Adds stringToUTF8String to the names imported from "../shared/util.js".
#   - In StructElementNode, the attached file stream whose Subtype is
#     "application/mathml+xml" is now returned as
#     stringToUTF8String(fileStream.getString()) instead of the raw
#     fileStream.getString() result.
#   - NOTE(review): how stringToUTF8String behaves on input that is not valid
#     UTF-8 is not visible in this patch -- confirm it cannot throw here, or
#     fall back to the undecoded string.
#
# test/integration/accessibility_spec.mjs
#   - Adds the suite "MathML with some attributes in AF entry from LaTeX": it
#     loads bug1997343.pdf, and when `Sanitizer` is defined in the page it
#     compares the innerHTML of "span.structTree span[aria-owns='p21R_mc64']"
#     with an expected string; otherwise the spec is marked pending.
#   - NOTE(review): the expected string in this copy contains no MathML tags;
#     it looks as if markup was stripped from it -- verify against the
#     original commit before applying.
#   - NOTE(review): `typeof Sanitizer` is wrapped in try/catch; presumably the
#     catch branch is never taken -- confirm whether it is needed.
#
# test/pdfs/.gitignore, test/pdfs/bug1997343.pdf
#   - Un-ignores and adds the binary test PDF. It is created with mode 100755;
#     NOTE(review): check whether the executable bit is intended.
#
# web/struct_tree_layer_builder.js
#   - In MathMLSanitizer, the list ending in "accentunder", "columnspan",
#     "rowspan" is no longer mapped to { name, namespace: MathMLNamespace }
#     objects; the plain string array is passed instead.
#
# NOTE(review): this copy of the patch has its line breaks collapsed, so it
# cannot be applied as-is; restore the original newlines and indentation first.
diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index fa2d6e98c..083230c2b 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -13,7 +13,12 @@ * limitations under the License. */ -import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js"; +import { + AnnotationPrefix, + stringToPDFString, + stringToUTF8String, + warn, +} from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; import { lookupNormalRect, stringToAsciiOrUTF16BE } from "./core_utils.js"; import { BaseStream } from "./base_stream.js"; @@ -610,7 +615,8 @@ class StructElementNode { if (!isName(fileStream.dict.get("Subtype"), "application/mathml+xml")) { continue; } - return fileStream.getString(); + // The default encoding for xml files is UTF-8. + return stringToUTF8String(fileStream.getString()); } const A = this.dict.get("A"); if (A instanceof Dict) { diff --git a/test/integration/accessibility_spec.mjs b/test/integration/accessibility_spec.mjs index 5bb2c7915..94472ddc8 100644 --- a/test/integration/accessibility_spec.mjs +++ b/test/integration/accessibility_spec.mjs @@ -346,6 +346,46 @@ describe("accessibility", () => { }); }); + describe("MathML with some attributes in AF entry from LaTeX", () => { + let pages; + + beforeEach(async () => { + pages = await loadAndWait("bug1997343.pdf", ".textLayer"); + }); + + afterEach(async () => { + await closePages(pages); + }); + + it("must check that the MathML is correctly inserted", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const isSanitizerSupported = await page.evaluate(() => { + try { + // eslint-disable-next-line no-undef + return typeof Sanitizer !== "undefined"; + } catch { + return false; + } + }); + if (isSanitizerSupported) { + const mathML = await page.$eval( + "span.structTree span[aria-owns='p21R_mc64']", + el => el?.innerHTML ?? 
"" + ); + expect(mathML) + .withContext(`In ${browserName}`) + .toEqual( + ' 𝑛 𝑝 = 𝑛 mod 𝑝 ' + ); + } else { + pending(`Sanitizer API (in ${browserName}) is not supported`); + } + }) + ); + }); + }); + describe("MathML tags in the struct tree", () => { let pages; diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index ed64906db..1eb2dce1b 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -753,3 +753,4 @@ !bug1937438_af_from_latex.pdf !bug1937438_from_word.pdf !bug1937438_mml_from_latex.pdf +!bug1997343.pdf diff --git a/test/pdfs/bug1997343.pdf b/test/pdfs/bug1997343.pdf new file mode 100755 index 000000000..378d0fc41 Binary files /dev/null and b/test/pdfs/bug1997343.pdf differ diff --git a/web/struct_tree_layer_builder.js b/web/struct_tree_layer_builder.js index 66e1f98cb..393191420 100644 --- a/web/struct_tree_layer_builder.js +++ b/web/struct_tree_layer_builder.js @@ -155,10 +155,7 @@ class MathMLSanitizer { "accentunder", "columnspan", "rowspan", - ].map(name => ({ - name, - namespace: MathMLNamespace, - })), + ], comments: false, }) : null