diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index 6ed379e3b..539ae008f 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -16,6 +16,7 @@ import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js"; import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js"; import { lookupNormalRect, stringToAsciiOrUTF16BE } from "./core_utils.js"; +import { BaseStream } from "./base_stream.js"; import { NumberTree } from "./name_number_tree.js"; const MAX_DEPTH = 40; @@ -579,6 +580,50 @@ class StructElementNode { return root.roleMap.get(name) ?? name; } + get mathML() { + let AFs = this.dict.get("AF") || []; + if (!Array.isArray(AFs)) { + AFs = [AFs]; + } + for (let af of AFs) { + af = this.xref.fetchIfRef(af); + if (!(af instanceof Dict)) { + continue; + } + if (!isName(af.get("Type"), "Filespec")) { + continue; + } + if (!isName(af.get("AFRelationship"), "Supplement")) { + continue; + } + const ef = af.get("EF"); + if (!(ef instanceof Dict)) { + continue; + } + const fileStream = ef.get("UF") || ef.get("F"); + if (!(fileStream instanceof BaseStream)) { + continue; + } + if (!isName(fileStream.dict.get("Type"), "EmbeddedFile")) { + continue; + } + if (!isName(fileStream.dict.get("Subtype"), "application/mathml+xml")) { + continue; + } + return fileStream.getString(); + } + const A = this.dict.get("A"); + if (A instanceof Dict) { + // This stuff isn't in the spec, but MS Office seems to use it. + const O = A.get("O"); + if (isName(O, "MSFT_Office")) { + const mathml = A.get("MSFT_MathML"); + return mathml ? stringToPDFString(mathml) : null; + } + } + return null; + } + parseKids() { let pageObjId = null; const objRef = this.dict.getRaw("Pg"); @@ -842,6 +887,12 @@ class StructTreePage { if (typeof alt === "string") { obj.alt = stringToPDFString(alt); } + if (obj.role === "Formula") { + const { mathML } = node; + if (mathML) { + obj.mathML = mathML; + } + } const a = node.dict.get("A"); if (a instanceof Dict) { diff --git a/src/shared/util.js b/src/shared/util.js index ffc010129..b50819976 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -658,6 +658,15 @@ class FeatureTest { ); } + static get isSanitizerSupported() { + return shadow( + this, + "isSanitizerSupported", + // eslint-disable-next-line no-undef + typeof Sanitizer !== "undefined" + ); + } + static get platform() { const { platform, userAgent } = navigator; diff --git a/test/integration/accessibility_spec.mjs b/test/integration/accessibility_spec.mjs index 5dfca3785..5bb2c7915 100644 --- a/test/integration/accessibility_spec.mjs +++ b/test/integration/accessibility_spec.mjs @@ -305,4 +305,72 @@ describe("accessibility", () => { ); }); }); + + describe("MathML in AF entry from LaTeX", () => { + let pages; + + beforeEach(async () => { + pages = await loadAndWait("bug1937438_af_from_latex.pdf", ".textLayer"); + }); + + afterEach(async () => { + await closePages(pages); + }); + + it("must check that the MathML is correctly inserted", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const isSanitizerSupported = await page.evaluate(() => { + try { + // eslint-disable-next-line no-undef + return typeof Sanitizer !== "undefined"; + } catch { + return false; + } + }); + if (isSanitizerSupported) { + const mathML = await page.$eval( + "span.structTree span[aria-owns='p58R_mc13'] > math", + el => el?.innerHTML ?? "" + ); + expect(mathML) + .withContext(`In ${browserName}`) + .toEqual( + " x2 = |x| " + ); + } else { + pending(`Sanitizer API (in ${browserName}) is not supported`); + } + }) + ); + }); + }); + + describe("MathML tags in the struct tree", () => { + let pages; + + beforeEach(async () => { + pages = await loadAndWait("bug1937438_mml_from_latex.pdf", ".textLayer"); + }); + + afterEach(async () => { + await closePages(pages); + }); + + it("must check that the MathML is correctly inserted", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const mathML = await page.$eval( + "span.structTree span[role='group'] span[role='group']:last-child > span math", + el => el?.innerHTML ?? "" + ); + expect(mathML) + .withContext(`In ${browserName}`) + .toEqual( + `` + ); + }) + ); + }); + }); }); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index b59617d45..75b11be7d 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -749,3 +749,6 @@ !issue20319_1.pdf !issue20319_2.pdf !bug1992868.pdf +!bug1937438_af_from_latex.pdf +!bug1937438_from_word.pdf +!bug1937438_mml_from_latex.pdf diff --git a/test/pdfs/bug1937438_af_from_latex.pdf b/test/pdfs/bug1937438_af_from_latex.pdf new file mode 100755 index 000000000..b9e897358 --- /dev/null +++ b/test/pdfs/bug1937438_af_from_latex.pdf @@ -0,0 +1,965 @@ +%PDF-2.0 +% +22 0 obj +<< /Type /EmbeddedFile /Subtype /application#2Fmathml+xml /Params<> /Length 25 >> +stream + x +endstream +endobj +23 0 obj +<< /Type /Filespec /AFRelationship /Supplement /Desc (mathml-1) /F /UF /EF<> >> +endobj +24 0 obj +<< /Type /EmbeddedFile /Subtype /application#2Fmathml+xml /Params<> /Length 25 >> +stream + y +endstream +endobj +25 0 obj +<< /Type /Filespec /AFRelationship /Supplement /Desc (mathml-2) /F /UF /EF<> >> +endobj +26 0 obj +<< /Type /EmbeddedFile /Subtype /application#2Fmathml+xml /Params<> /Length 50 >> +stream + x > y +endstream +endobj +27 0 obj +<< /Type /Filespec /AFRelationship /Supplement /Desc (mathml-3) /F /UF /EF<> >> +endobj +28 0 obj +<< /Type /EmbeddedFile /Subtype /application#2Fmathml+xml /Params<> /Length 154 >> +stream + x2 = |x| +endstream +endobj +29 0 obj +<< /Type /Filespec /AFRelationship /Supplement /Desc (mathml-4) /F /UF /EF<> >> +endobj +43 0 obj +<< /Subtype /application#2Fx-tex/Type /EmbeddedFile /Params<> /Length 3 >> +stream +$x$ +endstream +endobj +44 0 obj +<< /Type /Filespec /AFRelationship /Source /Desc (TeX source) /F /UF /EF<> >> +endobj +46 0 obj +<< /Subtype /application#2Fx-tex/Type /EmbeddedFile /Params<> /Length 3 >> +stream +$y$ +endstream +endobj +47 0 obj +<< /Type /Filespec /AFRelationship /Source /Desc (TeX source) /F /UF /EF<> >> +endobj +49 0 obj +<< /Subtype /application#2Fx-tex/Type /EmbeddedFile /Params<> /Length 5 >> +stream +$x>y$ +endstream +endobj +50 0 obj +<< /Type /Filespec /AFRelationship /Source /Desc (TeX source) /F /UF /EF<> >> +endobj +54 0 obj +<< /Subtype /application#2Fx-tex/Type /EmbeddedFile /Params<> /Length 61 >> +stream +\begin {equation*}\sqrt {x^2}=\lvert x\rvert \end {equation*} +endstream +endobj +55 0 obj +<< /Type /Filespec /AFRelationship /Source /Desc (TeX source) /F /UF /EF<> >> +endobj +56 0 obj +<< /Type /Metadata /Subtype /XML /Length 11660 >> +stream + + + + + + + + + XMP Media Management Schema + xmpMM + http://ns.adobe.com/xap/1.0/mm/ + + + + OriginalDocumentID + URI + internal + The common identifier for all versions and renditions of a document. + + + + + + PDF/A Identification Schema + pdfaid + http://www.aiim.org/pdfa/ns/id/ + + + + year + Integer + internal + Year of standard + + + + + + PDF/UA Universal Accessibility Schema + pdfuaid + http://www.aiim.org/pdfua/ns/id/ + + + + part + Integer + internal + Part of ISO 14289 standard + + + rev + Integer + internal + Revision of ISO 14289 standard + + + + + + PDF/X ID Schema + pdfxid + http://www.npes.org/pdfx/ns/id/ + + + + GTS_PDFXVersion + Text + internal + ID of PDF/X standard + + + + + + PRISM Basic Metadata + prism + http://prismstandard.org/namespaces/basic/3.0/ + + + + complianceProfile + Text + internal + PRISM specification compliance profile to which this document adheres + + + publicationName + Text + external + Publication name + + + aggregationType + Text + external + Publication type + + + bookEdition + Text + external + Edition of the book in which the document was published + + + volume + Text + external + Publication volume number + + + number + Text + external + Publication issue number within a volume + + + pageRange + Text + external + Page range for the document within the print version of its publication + + + issn + Text + external + ISSN for the printed publication in which the document was published + + + eIssn + Text + external + ISSN for the electronic publication in which the document was published + + + isbn + Text + external + ISBN for the publication in which the document was published + + + doi + Text + external + Digital Object Identifier for the document + + + url + URL + external + URL at which the document can be found + + + byteCount + Integer + internal + Approximate file size in octets + + + pageCount + Integer + internal + Number of pages in the print version of the document + + + subtitle + Text + external + Document's subtitle + + + + + + + luahbtex-1.17.0 + 2.0 + 2 + 2024 + + + David + + + + + Math Test One + + + + + Text + + + + + en-US + + + + + 2024-02-23T21:38:22Z + + + application/pdf + mathml-AF-ex1.tex + LaTeX + 2024-02-23T21:38:22Z + 2024-02-23T21:38:22Z + 2024-02-23T21:38:22Z + uuid:9b73c2b4-65f6-4897-89df-cb965cd08a00 + uuid:220f5099-def3-4191-8b9d-00a07a988b2e + three + 1 + + + + + + + + + + + + + + +endstream +endobj +59 0 obj +<< /Length 2090 >> +stream +/opacity1 gs +/Artifact BMC +EMC +/text<> BDC +BT +/F54 17.21544 Tf +1 0 0 1 252.247 615.392 Tm [<000F00FF011301060001>20<0016>70<01030112011300010011010D0103>]TJ +ET +EMC +/text<> BDC +BT +/F54 11.95517 Tf +1 0 0 1 290.74 582.516 Tm [<0001>235<000600FF>25<011501070102>]TJ +ET +EMC +/text<> BDC +BT +/F54 11.95517 Tf +1 0 0 1 260.72 554.947 Tm [<0008>40<010301000111011400FF0111011800010788078907210001078807860788078A>]TJ +ET +EMC +/Lbl<> BDC +BT +/F69 14.3462 Tf +1 0 0 1 133.768 510.558 Tm [<07870001>]TJ +ET +EMC +/section<> BDC +BT +/F69 14.3462 Tf +1 0 0 1 155.216 510.558 Tm [<000F00FF011301060001>20<0016>70<0103011201130112>]TJ +ET +EMC +/Artifact BMC +EMC +/text<> BDC +BT +/F54 9.96264 Tf +1 0 0 1 133.768 487.765 Tm [<0015010E010C010300010107010D010B0107010D01030001010C00FF0113010607210001010B010301130001>]TJ +ET +EMC +/Formula<> BDC +BT +/F42 9.96264 Tf +1 0 0 1 226.162 487.765 Tm [<0D1A>]TJ +ET +EMC +/text<> BDC +BT +/F54 9.96264 Tf +1 0 0 1 231.731 487.765 Tm [<000100FF010D01020001>]TJ +ET +EMC +/Formula<> BDC +BT +/F42 9.96264 Tf +1 0 0 1 252.304 487.765 Tm [<0D1B>]TJ +ET +EMC +/text<> BDC +BT +/F54 9.96264 Tf +1 0 0 1 257.385 487.765 Tm [<0001011200FF011301070112010401180001>]TJ +ET +EMC +/Formula<> BDC +BT +/F42 9.96264 Tf +1 0 0 1 288.568 487.765 Tm [<0D1A>-287<04B4>-278<0D1B>]TJ +ET +EMC +/text<> BDC +BT +/F54 9.96264 Tf +1 0 0 1 312.025 487.765 Tm [<0725>]TJ +ET +EMC +/text<> BDC +BT +/F54 9.96264 Tf +1 0 0 1 148.712 475.81 Tm [<0015010E010C010300010113>5<0103>10<011701130721000100FF010D0102000100FF010D000101030110011400FF01130107010E010D0725>]TJ +ET +EMC +/Formula<> BDC +BT +/F42 9.96264 Tf +1 0 0 1 284.96 452.696 Tm [<0679>]TJ +ET +q +1 0 0 1 292.87 461.543 cm +[] 0 d 0 J 0.677 w 0 0 m 9.722 0 l S +Q +BT +/F42 9.96264 Tf +1 0 0 1 292.87 451.238 Tm [<0D1A>]TJ +/F43 6.97385 Tf +1 0 0 1 298.539 453.749 Tm [<10B3>]TJ +/F42 9.96264 Tf +1 0 0 1 305.359 451.238 Tm [<04B2>-277<04260D1A>-10<0426>]TJ +ET +EMC +/Artifact BMC +EMC +/Artifact BMC +BT +/F54 9.96264 Tf +1 0 0 1 303.158 89.365 Tm [<0787>]TJ +ET +EMC +/Artifact BMC +EMC + +endstream +endobj +58 0 obj +<< /Type /Page /Contents 59 0 R /Resources 57 0 R /MediaBox [ 0 0 612 792 ] /StructParents 0/Tabs /S /Parent 64 0 R >> +endobj +57 0 obj +<< /ExtGState 1 0 R /Font << /F54 60 0 R /F69 61 0 R /F42 62 0 R /F43 63 0 R >> >> +endobj +1 0 obj +<< /opacity1 <> >> +endobj +65 0 obj +<< /Marked true >> +endobj +66 0 obj +<< /DisplayDocTitle true >> +endobj +67 0 obj +<< /Names[(l3ef0001) 23 0 R (l3ef0002) 25 0 R (l3ef0003) 27 0 R (l3ef0004) 29 0 R] >> +endobj +6 0 obj +<< /Nums [0 [ 34 0 R 35 0 R 36 0 R 39 0 R 38 0 R 41 0 R 42 0 R 41 0 R 45 0 R 41 0 R 48 0 R 41 0 R 52 0 R 53 0 R] +] >> +endobj +68 0 obj +<< /Limits [(ID.001) (ID.019)]/Names [(ID.001) 21 0 R (ID.002) 30 0 R (ID.003) 31 0 R (ID.004) 32 0 R (ID.005) 33 0 R (ID.006) 34 0 R (ID.007) 35 0 R (ID.008) 36 0 R (ID.009) 37 0 R (ID.010) 38 0 R (ID.011) 39 0 R (ID.012) 40 0 R (ID.013) 41 0 R (ID.014) 42 0 R (ID.015) 45 0 R (ID.016) 48 0 R (ID.017) 51 0 R (ID.018) 52 0 R (ID.019) 53 0 R ] >> +endobj +69 0 obj +<< /Kids [68 0 R] >> +endobj +7 0 obj +<< /Artifact /NonStruct /DocumentFragment /Art /Aside /Note /H7 /H6 /H8 /H6 /H9 /H6 /H10 /H6 /Title /P /FENote /Note /Sub /Span /Em /Span /Strong /Span /title /P /part /P /section /H1 /subsection /H2 /subsubsection /H3 /paragraph /H4 /subparagraph /H5 /list /L /itemize /L /enumerate /L /description /L /quote /BlockQuote /quotation /BlockQuote /verbatim /P /item /LI /itemlabel /Lbl /itembody /LBody /footnote /Note /footnotemark /Lbl /footnotelabel /Lbl /text-unit /Part /text /P /theorem-like /Sect /codeline /Span /float /Note /figures /Sect /tables /Sect >> +endobj +70 0 obj +<< /display <> +/justify <> +/inline <> +/center <> +/TH-both <> +/TH-row <> +/TH-col <> >> +endobj +9 0 obj +<< /Type /Namespace /NS (http://iso.org/pdf/ssn) >> +endobj +11 0 obj +<< /Type /Namespace /NS (http://iso.org/pdf2/ssn) >> +endobj +13 0 obj +<< /Type /Namespace /NS (http://www.w3.org/1998/Math/MathML) >> +endobj +16 0 obj +<< /title [/Title 11 0 R] /part [/Title 11 0 R] /section [/H1 11 0 R] /subsection [/H2 11 0 R] /subsubsection [/H3 11 0 R] /paragraph [/H4 11 0 R] /subparagraph [/H5 11 0 R] /list [/L 11 0 R] /itemize [/L 11 0 R] /enumerate [/L 11 0 R] /description [/L 11 0 R] /quote [/BlockQuote 9 0 R] /quotation [/BlockQuote 9 0 R] /verbatim [/P 11 0 R] /item [/LI 11 0 R] /itemlabel [/Lbl 11 0 R] /itembody [/LBody 11 0 R] /footnote [/FENote 11 0 R] /footnotemark [/Lbl 11 0 R] /footnotelabel [/Lbl 11 0 R] /text-unit [/Part 11 0 R] /text [/P 11 0 R] /theorem-like [/Sect 11 0 R] /codeline [/Sub 11 0 R] /float [/Aside 11 0 R] /figures [/Sect 11 0 R] /tables [/Sect 11 0 R] >> +endobj +15 0 obj +<< /Type /Namespace /NS (https://www.latex-project.org/ns/dflt/2022) /RoleMapNS 16 0 R >> +endobj +18 0 obj +<< /chapter [/H1 11 0 R] /section [/H2 11 0 R] /subsection [/H3 11 0 R] /subsubsection [/H4 11 0 R] /paragraph [/H5 11 0 R] /subparagraph [/H6 11 0 R] >> +endobj +17 0 obj +<< /Type /Namespace /NS (https://www.latex-project.org/ns/book/2022) /RoleMapNS 18 0 R >> +endobj +19 0 obj +<< /Type /Namespace /NS (data:,173E68E4-F47F-6026-897D-257CC7127A46) >> +endobj +8 0 obj +[ 9 0 R 11 0 R 13 0 R 15 0 R 17 0 R 19 0 R ] +endobj +21 0 obj +<< /Type /StructElem /S /Document /P 5 0 R /K [32 0 R 37 0 R] /NS 11 0 R /ID (ID.001) >> +endobj +30 0 obj +<< /Type /StructElem /S /Artifact /P 5 0 R /NS 15 0 R /ID (ID.002) >> +endobj +31 0 obj +<< /Type /StructElem /S /Artifact /P 5 0 R /NS 15 0 R /ID (ID.003) >> +endobj +32 0 obj +<< /Type /StructElem /S /text-unit /P 21 0 R /K [33 0 R 35 0 R 36 0 R] /NS 15 0 R /ID (ID.004) >> +endobj +33 0 obj +<< /Type /StructElem /S /Title /P 32 0 R /K 34 0 R /NS 11 0 R /ID (ID.005) >> +endobj +34 0 obj +<< /Type /StructElem /S /text /P 33 0 R /K <> /C /center /NS 15 0 R /ID (ID.006) >> +endobj +35 0 obj +<< /Type /StructElem /S /text /P 32 0 R /K <> /C /center /NS 15 0 R /ID (ID.007) >> +endobj +36 0 obj +<< /Type /StructElem /S /text /P 32 0 R /K <> /C /center /NS 15 0 R /ID (ID.008) >> +endobj +37 0 obj +<< /Type /StructElem /S /Sect /P 21 0 R /K [38 0 R 40 0 R 51 0 R] /NS 11 0 R /ID (ID.009) >> +endobj +38 0 obj +<< /Type /StructElem /S /section /P 37 0 R /K [39 0 R <> ] /NS 15 0 R /ID (ID.010) >> +endobj +39 0 obj +<< /Type /StructElem /S /Lbl /P 38 0 R /K [<> ] /NS 11 0 R /ID (ID.011) >> +endobj +40 0 obj +<< /Type /StructElem /S /text-unit /P 37 0 R /K 41 0 R /NS 15 0 R /ID (ID.012) >> +endobj +41 0 obj +<< /Type /StructElem /S /text /P 40 0 R /K [<> 42 0 R <> 45 0 R <> 48 0 R <> ] /C /justify /NS 15 0 R /ID (ID.013) >> +endobj +42 0 obj +<< /Type /StructElem /S /Formula /P 41 0 R /K <> /C /inline /T /AF [23 0 R 44 0 R] /NS 11 0 R /ID (ID.014) >> +endobj +45 0 obj +<< /Type /StructElem /S /Formula /P 41 0 R /K <> /C /inline /T /AF [25 0 R 47 0 R] /NS 11 0 R /ID (ID.015) >> +endobj +48 0 obj +<< /Type /StructElem /S /Formula /P 41 0 R /K <> /C /inline /T /AF [27 0 R 50 0 R] /NS 11 0 R /ID (ID.016) >> +endobj +51 0 obj +<< /Type /StructElem /S /text-unit /P 37 0 R /K [52 0 R 53 0 R] /NS 15 0 R /ID (ID.017) >> +endobj +52 0 obj +<< /Type /StructElem /S /text /P 51 0 R /K <> /C /justify /NS 15 0 R /ID (ID.018) >> +endobj +53 0 obj +<< /Type /StructElem /S /Formula /P 51 0 R /K <> /C /display /T /AF [29 0 R 55 0 R] /NS 11 0 R /ID (ID.019) >> +endobj +5 0 obj +<< /Type /StructTreeRoot /K 21 0 R /IDTree 69 0 R /ParentTree 6 0 R /RoleMap 7 0 R /ClassMap 70 0 R /Namespaces 8 0 R >> +endobj +71 0 obj +[ 4275 [ 524 ] ] +endobj +73 0 obj +<< /Subtype /CIDFontType0C /Length 692 >> +stream +XVNRED+STIXTwoMath-RegularL!"   =  f  +Cw "w $z % XVNRED+STIXTwoMath-Regular2.120STIX Fonts and STIX Two are trademarks of The Institute of Electrical and Electronics Engineers, Inc.Copyright 2001-2021 The STIX Fonts Project Authors https:github.comstipubstixfontsSTIX Two MathNormalAdobeIdentity=̾XS@@=ɿp=NNvvs5'4)&FR:\tppv˳h<.fqvhfupkjtTFw]}BhJquR= 6 $DR.j~̰ٹXDVO"$Z. &)|iԘ + + ޔ d +endstream +endobj +72 0 obj +<< /Type /FontDescriptor /FontName /XVNRED+STIXTwoMath-Regular /Flags 4 /FontBBox [ -978 -1641 3072 2627 ] /Ascent 762 /CapHeight 703 /Descent -238 /ItalicAngle 0 /StemV 98 /XHeight 473 /FontFile3 73 0 R >> +endobj +74 0 obj +<< /Length 702 >> +stream +%!PS-Adobe-3.0 Resource-CMap +%%DocumentNeededResources: ProcSet (CIDInit) +%%IncludeResource: ProcSet (CIDInit) +%%BeginResource: CMap (TeX-XVNRED-STIXTwoMath-Regular-0) +%%Title: (TeX-XVNRED-STIXTwoMath-Regular-0 TeX XVNRED-STIXTwoMath-Regular 0) +%%Version: 1.000 +%%EndComments +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (TeX) +/Ordering (XVNRED-STIXTwoMath-Regular) +/Supplement 0 +>> def +/CMapName /TeX-Identity-XVNRED-STIXTwoMath-Regular def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +0 beginbfrange +endbfrange +1 beginbfchar +<10B3> <0032> +endbfchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end +%%EndResource +%%EOF + +endstream +endobj +63 0 obj +<< /Type /Font /Subtype /Type0 /Encoding /Identity-H /BaseFont /XVNRED+STIXTwoMath-Regular /DescendantFonts [ 75 0 R ] /ToUnicode 74 0 R >> +endobj +75 0 obj +<< /Type /Font /Subtype /CIDFontType0 /BaseFont /XVNRED+STIXTwoMath-Regular /FontDescriptor 72 0 R /W 71 0 R /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> >> +endobj +76 0 obj +[ 1062 [ 267 ] 1202 [ 720 ] 1204 [ 720 ] 1657 [ 794 ] 3354 [ 559 510 ] ] +endobj +78 0 obj +<< /Subtype /CIDFontType0C /Length 1046 >> +stream +ETRRWY+STIXTwoMath-RegularL!"   =  f  +Cw  " $ % ETRRWY+STIXTwoMath-Regular2.120STIX Fonts and STIX Two are trademarks of The Institute of Electrical and Electronics Engineers, Inc.Copyright 2001-2021 The STIX Fonts Project Authors https:github.comstipubstixfontsSTIX Two MathNormalAdobeIdentity&y  #;=̾XS@@=ɿp=NNvvs5'4)&FR:\tppv˳h<.fqvhfupkjt[v2w6p2R2&G`HrFNnNnHk/.Zw3o}wD=w}xs|{m͌p70LC"Sdy,zsus|uwrsǬ&JeϵϷvgpxwqlwovePcZ[pFpEvRwJۄu|"nȐZ^`o;xuet{~mfoثqfn|yveoqsssdM &)|iԘ + + ޔ d +endstream +endobj +77 0 obj +<< /Type /FontDescriptor /FontName /ETRRWY+STIXTwoMath-Regular /Flags 4 /FontBBox [ -978 -1641 3072 2627 ] /Ascent 762 /CapHeight 657 /Descent -238 /ItalicAngle 0 /StemV 82 /XHeight 473 /FontFile3 78 0 R >> +endobj +79 0 obj +<< /Length 780 >> +stream +%!PS-Adobe-3.0 Resource-CMap +%%DocumentNeededResources: ProcSet (CIDInit) +%%IncludeResource: ProcSet (CIDInit) +%%BeginResource: CMap (TeX-ETRRWY-STIXTwoMath-Regular-0) +%%Title: (TeX-ETRRWY-STIXTwoMath-Regular-0 TeX ETRRWY-STIXTwoMath-Regular 0) +%%Version: 1.000 +%%EndComments +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (TeX) +/Ordering (ETRRWY-STIXTwoMath-Regular) +/Supplement 0 +>> def +/CMapName /TeX-Identity-ETRRWY-STIXTwoMath-Regular def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +0 beginbfrange +endbfrange +6 beginbfchar +<0426> <007C> +<04B2> <003D> +<04B4> <003E> +<0679> <221A> +<0D1A> +<0D1B> +endbfchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end +%%EndResource +%%EOF + +endstream +endobj +62 0 obj +<< /Type /Font /Subtype /Type0 /Encoding /Identity-H /BaseFont /ETRRWY+STIXTwoMath-Regular /DescendantFonts [ 80 0 R ] /ToUnicode 79 0 R >> +endobj +80 0 obj +<< /Type /Font /Subtype /CIDFontType0 /BaseFont /ETRRWY+STIXTwoMath-Regular /FontDescriptor 77 0 R /W 76 0 R /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> >> +endobj +81 0 obj +[ 1 [ 235 ] 15 [ 996 ] 22 [ 659 ] 255 [ 528 ] 259 [ 515 ] 262 [ 640 ] 274 [ 440 382 ] 1927 [ 495 ] ] +endobj +83 0 obj +<< /Subtype /CIDFontType0C /Length 1432 >> +stream +OSVLMY+STIXTwoText-BoldG!"   =  HF "^ $ %OSVLMY+STIXTwoText-Bold2.120STIX Fonts and STIX Two are trademarks of The Institute of Electrical and Electronics Engineers, Inc.Copyright 2001-2021 The STIX Fonts Project Authors https:github.comstipubstixfontsSTIX Two Text BoldSTIX Two TextAdobeIdentity + +P;5t=̾XS@@=ɿp=NNvv5'4)&FR:\tppv˳h<.fqFvhfupkjt6϶ +(@~pm"H}?`׶Fy"՜OcPc`יy@%;`܋4ZS)6\iZЀZ\04`YEץv^vR# #3С]z֭9u[||z{vmFBWO^uĖy6[=,o;v:jonyql|ִ٣L4&&''1(ٵqbjcq=5_ +8<@35&;뎟T9ɋqw#C#PKS$NdpRTM2lbhT{UcGՎI?YxMc7e iaRC*}}M5Ecn+Ȁ7b|rygPLW73kb_xkejy\Cjx_X}#wsz~de'Pc7X[Siah8wF(a/`IrTZ}hYcz-` &&rwMa ;H& +)  +endstream +endobj +82 0 obj +<< /Type /FontDescriptor /FontName /OSVLMY+STIXTwoText-Bold /Flags 4 /FontBBox [ -948 -434 1429 1047 ] /Ascent 762 /CapHeight 657 /Descent -238 /ItalicAngle 0 /StemV 92 /XHeight 473 /FontFile3 83 0 R >> +endobj +84 0 obj +<< /Length 799 >> +stream +%!PS-Adobe-3.0 Resource-CMap +%%DocumentNeededResources: ProcSet (CIDInit) +%%IncludeResource: ProcSet (CIDInit) +%%BeginResource: CMap (TeX-OSVLMY-STIXTwoText-Bold-0) +%%Title: (TeX-OSVLMY-STIXTwoText-Bold-0 TeX OSVLMY-STIXTwoText-Bold 0) +%%Version: 1.000 +%%EndComments +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (TeX) +/Ordering (OSVLMY-STIXTwoText-Bold) +/Supplement 0 +>> def +/CMapName /TeX-Identity-OSVLMY-STIXTwoText-Bold def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +0 beginbfrange +endbfrange +9 beginbfchar +<0001> <0020> +<000F> <004D> +<0016> <0054> +<00FF> <0061> +<0103> <0065> +<0106> <0068> +<0112> <0073> +<0113> <0074> +<0787> <0031> +endbfchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end +%%EndResource +%%EOF + +endstream +endobj +61 0 obj +<< /Type /Font /Subtype /Type0 /Encoding /Identity-H /BaseFont /OSVLMY+STIXTwoText-Bold /DescendantFonts [ 85 0 R ] /ToUnicode 84 0 R >> +endobj +85 0 obj +<< /Type /Font /Subtype /CIDFontType0 /BaseFont /OSVLMY+STIXTwoText-Bold /FontDescriptor 82 0 R /W 81 0 R /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> >> +endobj +86 0 obj +[ 1 [ 235 ] 6 [ 730 ] 8 [ 588 ] 15 [ 895 ] 17 [ 737 ] 21 [ 504 603 ] 255 [ 480 522 ] 258 [ 539 464 307 ] 262 [ 577 285 ] 267 [ 283 852 576 514 ] 272 [ 521 380 395 315 570 481 ] 279 [ 479 483 ] 1825 [ 245 ] 1829 [ 245 ] 1926 [ 495 495 495 495 495 ] ] +endobj +88 0 obj +<< /Subtype /CIDFontType0C /Length 3959 >> +stream +CVHPXV+STIXTwoText-RegularG!"   =  H  "@ $ % CVHPXV+STIXTwoText-Regular2.120STIX Fonts and STIX Two are trademarks of The Institute of Electrical and Electronics Engineers, Inc.Copyright 2001-2021 The STIX Fonts Project Authors https:github.comstipubstixfontsSTIX Two Text RegularSTIX Two TextAdobeIdentity   !%""\3(I^  N +4 + f / #=̾XS@@=ɿp=NNvv5'4)&FR:\tppv˳h<.fqFvhfupkjt;mִmll#z6dm֢B7:~3lmwxwc$RuOJ8^mtƨ=qf +~:|i:wy5 +3s,ׅ8mӤ}F@A{:m?|'}CB{>mC{5՛Rfg`mڙz@$}~:mB?Y !eZ [Z _h [$D PKH]E#"Pqvlw'#qKj|m (2@CɲV\qv{su0#5S""[]\h3?\m@%XhljlSH^S5~uqHЀHu0}7m8V#G1?1˯r_tl||}}xvwbHUSj|Ǖǩj4_5pEqK[$oig{j`nߦb4ww}Vho^~pGG(HW`ah1+g2,Vmxs[v$4ww;`@;W@FW7vf~pho_mlk#&9(6]ɨdbshp~\(]";rׯxaklg:9B C -;;54 撱ҬGA!d8xߖyxxw\JEgp]ptad0?a=Bn4wwlL[xV]tPLphohSyWnwFܖF9A[wFnZw'Hqon`R{HouhhuqmlqwCrhpbS{Inˈw;a]Cu0TNqGnvF` X=VXnUTwiRG]f\UsnngOLnxM~u6ZEzDnˈwPiHb`0L_g[UtnngSzHoyCx ISzDoZ5<(62)%61 5Z387_[9apLwpSfpjaia("42̶DMz:o}'knYtom|ZK?Mw:@xxėixd[mhAXsmoygSInG'[s |o˜VhckrXWeu|oU_\99vV~ggCC + +hd2Cnu{5vvNx6sgnM) 73o;.keJ9vf|oB9o|{|pf\GsiyʧknY) 0djyin;j +h*zhn;B۩nj >-0.>nw*~)pjn~t'pa~nOg\}niqw~me}ғ!߭pYcupknr:]sf{҃  udftniirGۭ11?b1 %>RE.5;_VZݶ-_T^-<Gwe>/fnVTj}h;Yhz-oG0wD4nOt~vVE<*GS*lְػ[@"YL(%,,iG~y`?~&H;ʳP%9M]4f~ɳiP:DTGxeP2)NnLXZcq~tvqnGv26wr2%6N<L2r{nr &#T|Oa ;Hӝ +  +endstream +endobj +87 0 obj +<< /Type /FontDescriptor /FontName /CVHPXV+STIXTwoText-Regular /Flags 4 /FontBBox [ -948 -373 1297 1047 ] /Ascent 762 /CapHeight 657 /Descent -238 /ItalicAngle 0 /StemV 82 /XHeight 473 /FontFile3 88 0 R >> +endobj +89 0 obj +<< /Length 1151 >> +stream +%!PS-Adobe-3.0 Resource-CMap +%%DocumentNeededResources: ProcSet (CIDInit) +%%IncludeResource: ProcSet (CIDInit) +%%BeginResource: CMap (TeX-CVHPXV-STIXTwoText-Regular-0) +%%Title: (TeX-CVHPXV-STIXTwoText-Regular-0 TeX CVHPXV-STIXTwoText-Regular 0) +%%Version: 1.000 +%%EndComments +/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (TeX) +/Ordering (CVHPXV-STIXTwoText-Regular) +/Supplement 0 +>> def +/CMapName /TeX-Identity-CVHPXV-STIXTwoText-Regular def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange +0 beginbfrange +endbfrange +33 beginbfchar +<0001> <0020> +<0006> <0044> +<0008> <0046> +<000F> <004D> +<0011> <004F> +<0015> <0053> +<0016> <0054> +<00FF> <0061> +<0100> <0062> +<0102> <0064> +<0103> <0065> +<0104> <0066> +<0106> <0068> +<0107> <0069> +<010B> <006C> +<010C> <006D> +<010D> <006E> +<010E> <006F> +<0110> <0071> +<0111> <0072> +<0112> <0073> +<0113> <0074> +<0114> <0075> +<0115> <0076> +<0117> <0078> +<0118> <0079> +<0721> <002C> +<0725> <002E> +<0786> <0030> +<0787> <0031> +<0788> <0032> +<0789> <0033> +<078A> <0034> +endbfchar +endcmap +CMapName currentdict /CMap defineresource pop +end +end +%%EndResource +%%EOF + +endstream +endobj +60 0 obj +<< /Type /Font /Subtype /Type0 /Encoding /Identity-H /BaseFont /CVHPXV+STIXTwoText-Regular /DescendantFonts [ 90 0 R ] /ToUnicode 89 0 R >> +endobj +90 0 obj +<< /Type /Font /Subtype /CIDFontType0 /BaseFont /CVHPXV+STIXTwoText-Regular /FontDescriptor 87 0 R /W 86 0 R /CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) /Supplement 0 >> >> +endobj +64 0 obj +<< /Type /Pages /Count 1 /Kids [ 58 0 R ] >> +endobj +91 0 obj +<< /EmbeddedFiles 67 0 R >> +endobj +92 0 obj +<< /Type /Catalog /Pages 64 0 R /Names 91 0 R /MarkInfo 65 0 R/ViewerPreferences 66 0 R/Lang (en-US)/StructTreeRoot 5 0 R/Metadata 56 0 R >> +endobj +93 0 obj +<< /Producer (luahbtex-1.17.0)/Creator (LaTeX)/Title /Author /CreationDate (D:20240223213822Z)/ModDate (D:20240223213822Z) /Trapped /False /PTEX.FullBanner (This is LuaHBTeX, Version 1.17.0 (TeX Live 2023)) >> +endobj +xref +0 94 +0000000002 65535 f +0000017552 00000 n +0000000003 00000 f +0000000004 00000 f +0000000010 00000 f +0000023241 00000 n +0000017782 00000 n +0000018316 00000 n +0000020548 00000 n +0000019173 00000 n +0000000012 00000 f +0000019241 00000 n +0000000014 00000 f +0000019311 00000 n +0000000020 00000 f +0000020074 00000 n +0000019392 00000 n +0000020352 00000 n +0000020181 00000 n +0000020459 00000 n +0000000000 00000 f +0000020609 00000 n +0000000020 00000 n +0000000194 00000 n +0000000421 00000 n +0000000595 00000 n +0000000822 00000 n +0000001021 00000 n +0000001248 00000 n +0000001551 00000 n +0000020716 00000 n +0000020804 00000 n +0000020892 00000 n +0000021008 00000 n +0000021104 00000 n +0000021238 00000 n +0000021372 00000 n +0000021506 00000 n +0000021617 00000 n +0000021752 00000 n +0000021877 00000 n +0000021977 00000 n +0000022241 00000 n +0000001778 00000 n +0000001924 00000 n +0000022424 00000 n +0000002173 00000 n +0000002319 00000 n +0000022607 00000 n +0000002568 00000 n +0000002716 00000 n +0000022791 00000 n +0000022900 00000 n +0000023036 00000 n +0000002965 00000 n +0000003169 00000 n +0000003418 00000 n +0000017453 00000 n +0000017318 00000 n +0000015168 00000 n +0000036975 00000 n +0000030881 00000 n +0000027812 00000 n +0000025172 00000 n +0000037333 00000 n +0000017599 00000 n +0000017635 00000 n +0000017680 00000 n +0000017916 00000 n +0000018279 00000 n +0000018895 00000 n +0000023378 00000 n +0000024187 00000 n +0000023411 00000 n +0000024410 00000 n +0000025328 00000 n +0000025530 00000 n +0000026749 00000 n +0000025619 00000 n +0000026972 00000 n +0000027968 00000 n +0000028170 00000 n +0000029803 00000 n +0000028287 00000 n +0000030022 00000 n +0000031034 00000 n +0000031233 00000 n +0000035542 00000 n +0000031499 00000 n +0000035764 00000 n +0000037131 00000 n +0000037395 00000 n +0000037439 00000 n +0000037596 00000 n +trailer +<< /Size 94 /Root 92 0 R /Info 93 0 R /ID [ ] >> +startxref +37906 +%%EOF diff --git a/test/pdfs/bug1937438_from_word.pdf b/test/pdfs/bug1937438_from_word.pdf new file mode 100755 index 000000000..8e6c88fc2 Binary files /dev/null and b/test/pdfs/bug1937438_from_word.pdf differ diff --git a/test/pdfs/bug1937438_mml_from_latex.pdf b/test/pdfs/bug1937438_mml_from_latex.pdf new file mode 100755 index 000000000..2082e4df6 Binary files /dev/null and b/test/pdfs/bug1937438_mml_from_latex.pdf differ diff --git a/test/unit/struct_tree_spec.js b/test/unit/struct_tree_spec.js index 055156559..ca722b854 100644 --- a/test/unit/struct_tree_spec.js +++ b/test/unit/struct_tree_spec.js @@ -21,6 +21,7 @@ function equalTrees(rootA, rootB) { expect(a.role).toEqual(b.role); expect(a.lang).toEqual(b.lang); expect(a.type).toEqual(b.type); + expect(a.mathML).toEqual(b.mathML); expect("children" in a).toEqual("children" in b); if (!a.children) { return; @@ -151,4 +152,154 @@ describe("struct tree", function () { ); await loadingTask.destroy(); }); + + it("parses structure with some MathML in AF dictionary", async function () { + const filename = "bug1937438_af_from_latex.pdf"; + const params = buildGetDocumentParams(filename); + const loadingTask = getDocument(params); + const doc = await loadingTask.promise; + const page = await doc.getPage(1); + const struct = await page.getStructTree(); + equalTrees( + { + children: [ + { + role: "Document", + children: [ + { + role: "Part", + children: [ + { + role: "P", + children: [ + { + role: "P", + children: [{ type: "content", id: "p58R_mc0" }], + }, + ], + }, + { + role: "P", + children: [{ type: "content", id: "p58R_mc1" }], + }, + { + role: "P", + children: [{ type: "content", id: "p58R_mc2" }], + }, + ], + }, + { + role: "Sect", + children: [ + { + role: "H1", + children: [ + { + role: "Lbl", + children: [{ type: "content", id: "p58R_mc3" }], + }, + { type: "content", id: "p58R_mc4" }, + ], + }, + { + role: "Part", + children: [ + { + role: "P", + children: [ + { type: "content", id: "p58R_mc5" }, + { + role: "Formula", + children: [{ type: "content", id: "p58R_mc6" }], + mathML: " x ", + }, + { type: "content", id: "p58R_mc7" }, + { + role: "Formula", + children: [{ type: "content", id: "p58R_mc8" }], + mathML: " y ", + }, + { type: "content", id: "p58R_mc9" }, + { + role: "Formula", + children: [{ type: "content", id: "p58R_mc10" }], + mathML: + " x > y ", + }, + { type: "content", id: "p58R_mc11" }, + ], + }, + ], + }, + { + role: "Part", + children: [ + { + role: "P", + children: [{ type: "content", id: "p58R_mc12" }], + }, + { + role: "Formula", + children: [{ type: "content", id: "p58R_mc13" }], + mathML: + ' x2 = |x| ', + }, + ], + }, + ], + }, + ], + }, + ], + role: "Root", + }, + struct + ); + await loadingTask.destroy(); + }); + + it("parses structure with some MathML in MS Office specific entry", async function () { + const filename = "bug1937438_from_word.pdf"; + const params = buildGetDocumentParams(filename); + const loadingTask = getDocument(params); + const doc = await loadingTask.promise; + const page = await doc.getPage(1); + const struct = await page.getStructTree(); + equalTrees( + { + children: [ + { + role: "Document", + children: [ + { + role: "P", + children: [ + { type: "content", id: "p3R_mc0" }, + { + role: "Formula", + children: [{ type: "content", id: "p3R_mc1" }], + alt: "pi", + mathML: '𝜋', + }, + { type: "content", id: "p3R_mc2" }, + ], + }, + { + role: "Formula", + children: [{ type: "content", id: "p3R_mc3" }], + alt: "6 sum from n equals 1 to infinity of 1 over n squared , equals pi squared", + mathML: + '6n=11n2=𝜋2', + }, + { role: "P", children: [{ type: "content", id: "p3R_mc4" }] }, + { role: "P", children: [{ type: "content", id: "p3R_mc5" }] }, + ], + }, + ], + role: "Root", + }, + struct + ); + await loadingTask.destroy(); + }); }); diff --git a/web/struct_tree_layer_builder.js b/web/struct_tree_layer_builder.js index dbc9d7ee4..66e1f98cb 100644 --- a/web/struct_tree_layer_builder.js +++ b/web/struct_tree_layer_builder.js @@ -15,6 +15,7 @@ /** @typedef {import("../src/display/api").PDFPageProxy} PDFPageProxy */ +import { FeatureTest, shadow } from "pdfjs-lib"; import { removeNullCharacters } from "./ui_utils.js"; const PDF_ROLE_TO_HTML_ROLE = { @@ -73,6 +74,98 @@ const PDF_ROLE_TO_HTML_ROLE = { Artifact: null, }; +const MathMLElements = new Set([ + "math", + "merror", + "mfrac", + "mi", + "mmultiscripts", + "mn", + "mo", + "mover", + "mpadded", + "mprescripts", + "mroot", + "mrow", + "ms", + "mspace", + "msqrt", + "mstyle", + "msub", + "msubsup", + "msup", + "mtable", + "mtd", + "mtext", + "mtr", + "munder", + "munderover", + "semantics", +]); +const MathMLNamespace = "http://www.w3.org/1998/Math/MathML"; + +class MathMLSanitizer { + static get sanitizer() { + // From https://w3c.github.io/mathml-docs/mathml-safe-list. + + return shadow( + this, + "sanitizer", + FeatureTest.isSanitizerSupported + ? // eslint-disable-next-line no-undef + new Sanitizer({ + elements: [...MathMLElements].map(name => ({ + name, + namespace: MathMLNamespace, + })), + replaceWithChildrenElements: [ + { + name: "maction", + namespace: MathMLNamespace, + }, + ], + attributes: [ + "dir", + "displaystyle", + "mathbackground", + "mathcolor", + "mathsize", + "scriptlevel", + "encoding", + "display", + "linethickness", + "intent", + "arg", + "form", + "fence", + "separator", + "lspace", + "rspace", + "stretchy", + "symmetric", + "maxsize", + "minsize", + "largeop", + "movablelimits", + "width", + "height", + "depth", + "voffset", + "accent", + "accentunder", + "columnspan", + "rowspan", + ].map(name => ({ + name, + namespace: MathMLNamespace, + })), + comments: false, + }) + : null + ); + } +} + const HEADING_PATTERN = /^H(\d+)$/; /** @@ -230,9 +323,12 @@ class StructTreeLayerBuilder { return null; } - const element = document.createElement("span"); + let element; if ("role" in node) { const { role } = node; + element = MathMLElements.has(role) + ? document.createElementNS(MathMLNamespace, role) + : document.createElement("span"); const match = role.match(HEADING_PATTERN); if (match) { element.setAttribute("role", "heading"); @@ -243,8 +339,24 @@ class StructTreeLayerBuilder { if (role === "Figure" && this.#addImageInTextLayer(node, element)) { return element; } + if (role === "Formula") { + if (node.mathML && MathMLSanitizer.sanitizer) { + element.setHTML(node.mathML, { + sanitizer: MathMLSanitizer.sanitizer, + }); + } + if ( + !node.mathML && + node.children.length === 1 && + node.children[0].role !== "math" + ) { + element = document.createElementNS(MathMLNamespace, "math"); + } + } } + element ||= document.createElement("span"); + this.#setAttributes(node, element); if (node.children) {