Get the text under highlight/squiggly/underline/strikethrough annotations (bug 1885505)

and add an invisible element containing the text in the annotation layer to make
it readable by a screen reader.
This commit is contained in:
Calixte Denizet 2025-06-18 21:05:53 +02:00
parent 7d3b2a6d3f
commit 3bdc5d54fe
10 changed files with 449 additions and 10 deletions

View File

@ -1247,6 +1247,10 @@ class Annotation {
return null;
}
get overlaysTextContent() {
return false;
}
get hasTextContent() {
return false;
}
@ -4711,6 +4715,10 @@ class HighlightAnnotation extends MarkupAnnotation {
}
}
get overlaysTextContent() {
return true;
}
static createNewDict(annotation, xref, { apRef, ap }) {
const { color, oldAnnotation, opacity, rect, rotation, user, quadPoints } =
annotation;
@ -4835,6 +4843,10 @@ class UnderlineAnnotation extends MarkupAnnotation {
this.data.popupRef = null;
}
}
get overlaysTextContent() {
return true;
}
}
class SquigglyAnnotation extends MarkupAnnotation {
@ -4879,6 +4891,10 @@ class SquigglyAnnotation extends MarkupAnnotation {
this.data.popupRef = null;
}
}
get overlaysTextContent() {
return true;
}
}
class StrikeOutAnnotation extends MarkupAnnotation {
@ -4918,6 +4934,10 @@ class StrikeOutAnnotation extends MarkupAnnotation {
this.data.popupRef = null;
}
}
get overlaysTextContent() {
return true;
}
}
class StampAnnotation extends MarkupAnnotation {

View File

@ -66,6 +66,7 @@ import { calculateMD5 } from "./calculate_md5.js";
import { Catalog } from "./catalog.js";
import { clearGlobalCaches } from "./cleanup_helper.js";
import { DatasetReader } from "./dataset_reader.js";
import { Intersector } from "./intersector.js";
import { Linearization } from "./parser.js";
import { NullStream } from "./stream.js";
import { ObjectLoader } from "./object_loader.js";
@ -632,6 +633,7 @@ class Page {
includeMarkedContent,
disableNormalization,
sink,
intersector = null,
}) {
const contentStreamPromise = this.getContentStream();
const resourcesPromise = this.loadResources(RESOURCES_KEYS_TEXT_CONTENT);
@ -658,6 +660,7 @@ class Page {
sink,
viewBox: this.view,
lang,
intersector,
});
}
@ -707,6 +710,8 @@ class Page {
intentDisplay = !!(intent & RenderingIntentFlag.DISPLAY),
intentPrint = !!(intent & RenderingIntentFlag.PRINT);
const highlightedAnnotations = [];
for (const annotation of annotations) {
// Get the annotation even if it's hidden because
// JS can change its display.
@ -732,9 +737,29 @@ class Page {
);
})
);
} else if (annotation.overlaysTextContent && isVisible) {
highlightedAnnotations.push(annotation);
}
}
if (highlightedAnnotations.length > 0) {
const intersector = new Intersector(highlightedAnnotations);
textContentPromises.push(
this.extractTextContent({
handler,
task,
includeMarkedContent: false,
disableNormalization: false,
sink: null,
viewBox: this.view,
lang: null,
intersector,
}).then(() => {
intersector.setText();
})
);
}
await Promise.all(textContentPromises);
return annotationsData;
}

View File

@ -2362,6 +2362,7 @@ class PartialEvaluator {
disableNormalization = false,
keepWhiteSpace = false,
prevRefs = null,
intersector = null,
}) {
const objId = stream.dict?.objId;
const seenRefs = new RefSet(prevRefs);
@ -2506,6 +2507,7 @@ class PartialEvaluator {
transform = textContentItem.prevTransform,
fontName = textContentItem.fontName,
}) {
intersector?.addExtraChar(" ");
textContent.items.push({
str: " ",
dir: "ltr",
@ -2964,9 +2966,21 @@ class PartialEvaluator {
if (!font.vertical) {
scaledDim *= textState.textHScale;
intersector?.addGlyph(
getCurrentTextTransform(),
scaledDim,
0,
glyph.unicode
);
textState.translateTextMatrix(scaledDim, 0);
textChunk.width += scaledDim;
} else {
intersector?.addGlyph(
getCurrentTextTransform(),
0,
scaledDim,
glyph.unicode
);
textState.translateTextMatrix(0, scaledDim);
scaledDim = Math.abs(scaledDim);
textChunk.height += scaledDim;
@ -2985,8 +2999,12 @@ class PartialEvaluator {
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
textChunk.str.push(" ");
intersector?.addExtraChar(" ");
}
if (!intersector) {
textChunk.str.push(glyphUnicode);
}
textChunk.str.push(glyphUnicode);
if (charSpacing) {
if (!font.vertical) {
@ -3002,6 +3020,7 @@ class PartialEvaluator {
}
function appendEOL() {
intersector?.addExtraChar("\n");
resetLastChars();
if (textContentItem.initialized) {
textContentItem.hasEOL = true;
@ -3027,6 +3046,7 @@ class PartialEvaluator {
if (textContentItem.initialized) {
resetLastChars();
textContentItem.str.push(" ");
intersector?.addExtraChar(" ");
}
return false;
}
@ -3078,7 +3098,7 @@ class PartialEvaluator {
if (batch && length < TEXT_CHUNK_BATCH_SIZE) {
return;
}
sink.enqueue(textContent, length);
sink?.enqueue(textContent, length);
textContent.items = [];
textContent.styles = Object.create(null);
}
@ -3088,7 +3108,7 @@ class PartialEvaluator {
return new Promise(function promiseBody(resolve, reject) {
const next = function (promise) {
enqueueChunk(/* batch = */ true);
Promise.all([promise, sink.ready]).then(function () {
Promise.all([promise, sink?.ready]).then(function () {
try {
promiseBody(resolve, reject);
} catch (ex) {
@ -3341,7 +3361,7 @@ class PartialEvaluator {
},
get desiredSize() {
return sink.desiredSize;
return sink.desiredSize ?? 0;
},
get ready() {
@ -3359,7 +3379,7 @@ class PartialEvaluator {
: resources,
stateManager: xObjStateManager,
includeMarkedContent,
sink: sinkWrapper,
sink: sink && sinkWrapper,
seenStyles,
viewBox,
lang,
@ -3499,7 +3519,7 @@ class PartialEvaluator {
}
break;
} // switch
if (textContent.items.length >= sink.desiredSize) {
if (textContent.items.length >= (sink?.desiredSize ?? 1)) {
// Wait for ready, if we reach highWaterMark.
stop = true;
break;

203
src/core/intersector.js Normal file
View File

@ -0,0 +1,203 @@
/* Copyright 2025 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Collects, for one annotation, the glyphs whose center lies inside one of the
 * annotation's quadrilaterals, and stores the resulting string on the
 * annotation data as `overlaidText`.
 */
class SingleIntersector {
  #annotation;

  // Bounding box enclosing every quad of the annotation.
  #minX = Infinity;

  #minY = Infinity;

  #maxX = -Infinity;

  #maxY = -Infinity;

  // Flattened per-quad boxes: [minX, minY, maxX, maxY, minX, minY, ...].
  #boxes = [];

  #text = [];

  // Chars (space, EOL) seen since the last accepted glyph; they're only kept
  // if another glyph of this annotation follows.
  #extraChars = [];

  // Offset in #boxes of the box which matched the previous glyph (or -1).
  #lastIntersectingBoxIndex = -1;

  #canTakeExtraChars = false;

  /**
   * @param {Object} annotation - Must expose `data.quadPoints`, a flat list
   *   of coordinates with 8 numbers (4 vertices) per quad.
   */
  constructor(annotation) {
    this.#annotation = annotation;
    const { quadPoints } = annotation.data;
    const boxes = this.#boxes;
    // The box of a quad is computed from its four vertices, hence it doesn't
    // depend on the order in which the vertices are listed. A trailing
    // incomplete quad (less than 8 numbers) is ignored.
    for (let i = 0, ii = quadPoints.length; i + 8 <= ii; i += 8) {
      const x0 = quadPoints[i];
      const y0 = quadPoints[i + 1];
      const x1 = quadPoints[i + 2];
      const y1 = quadPoints[i + 3];
      const x2 = quadPoints[i + 4];
      const y2 = quadPoints[i + 5];
      const x3 = quadPoints[i + 6];
      const y3 = quadPoints[i + 7];
      const minX = Math.min(x0, x1, x2, x3);
      const minY = Math.min(y0, y1, y2, y3);
      const maxX = Math.max(x0, x1, x2, x3);
      const maxY = Math.max(y0, y1, y2, y3);
      boxes.push(minX, minY, maxX, maxY);

      this.#minX = Math.min(this.#minX, minX);
      this.#minY = Math.min(this.#minY, minY);
      this.#maxX = Math.max(this.#maxX, maxX);
      this.#maxY = Math.max(this.#maxY, maxY);
    }
  }

  /**
   * Check if the bounding boxes of the two intersectors overlap (boxes which
   * only share an edge don't overlap).
   * @param {SingleIntersector} other
   * @returns {boolean}
   */
  overlaps(other) {
    return !(
      this.#minX >= other.#maxX ||
      this.#maxX <= other.#minX ||
      this.#minY >= other.#maxY ||
      this.#maxY <= other.#minY
    );
  }

  /**
   * Check if the point is strictly inside the box starting at `index` in
   * `#boxes`.
   * @param {number} index
   * @param {number} x
   * @param {number} y
   * @returns {boolean}
   */
  #isInBox(index, x, y) {
    const boxes = this.#boxes;
    return !(
      boxes[index] >= x ||
      boxes[index + 2] <= x ||
      boxes[index + 1] >= y ||
      boxes[index + 3] <= y
    );
  }

  /**
   * Check if the given point intersects with the annotation's quad points.
   * The point (x, y) is supposed to be the center of the glyph.
   * @param {number} x
   * @param {number} y
   * @returns {boolean}
   */
  #intersects(x, y) {
    if (
      this.#minX >= x ||
      this.#maxX <= x ||
      this.#minY >= y ||
      this.#maxY <= y
    ) {
      return false;
    }

    const boxes = this.#boxes;
    if (boxes.length === 4) {
      // We've only one quad, so if we intersect min/max bounds then we
      // intersect the quad.
      return true;
    }

    // Consecutive glyphs usually are in the same quad, so try first the one
    // which matched the previous glyph.
    const lastIndex = this.#lastIntersectingBoxIndex;
    if (lastIndex >= 0) {
      if (this.#isInBox(lastIndex, x, y)) {
        return true;
      }
      this.#lastIntersectingBoxIndex = -1;
    }

    for (let i = 0, ii = boxes.length; i < ii; i += 4) {
      if (this.#isInBox(i, x, y)) {
        this.#lastIntersectingBoxIndex = i;
        return true;
      }
    }
    return false;
  }

  /**
   * Add the glyph to the text if its center is inside the annotation.
   * The pending extra chars are flushed before the glyph when it's accepted
   * and dropped when it's rejected.
   * @param {number} x
   * @param {number} y
   * @param {string} glyph
   * @returns {boolean} true if the glyph has been added.
   */
  addGlyph(x, y, glyph) {
    if (!this.#intersects(x, y)) {
      this.disableExtraChars();
      return false;
    }

    if (this.#extraChars.length > 0) {
      this.#text.push(this.#extraChars.join(""));
      this.#extraChars.length = 0;
    }
    this.#text.push(glyph);
    this.#canTakeExtraChars = true;

    return true;
  }

  /**
   * Remember a char which doesn't come from a glyph (e.g. a space or an EOL).
   * It's ignored unless the last glyph has been accepted.
   * @param {string} char
   */
  addExtraChar(char) {
    if (this.#canTakeExtraChars) {
      this.#extraChars.push(char);
    }
  }

  /**
   * Stop collecting extra chars and drop the pending ones.
   */
  disableExtraChars() {
    if (!this.#canTakeExtraChars) {
      return;
    }
    this.#canTakeExtraChars = false;
    this.#extraChars.length = 0;
  }

  /**
   * Write the collected text into `data.overlaidText` of the annotation.
   */
  setText() {
    this.#annotation.data.overlaidText = this.#text.join("");
  }
}
/**
 * Dispatches the glyphs and the extra chars found while extracting the text
 * of a page to the annotations they're under.
 */
class Intersector {
  // Maps each SingleIntersector to the set of the intersectors, created after
  // it, whose bounding box overlaps its own one (or null when there's none).
  #intersectors = new Map();

  /**
   * @param {Iterable<Object>} annotations - The annotations without
   *   `data.quadPoints` are skipped.
   */
  constructor(annotations) {
    for (const annotation of annotations) {
      if (annotation.data.quadPoints) {
        this.#register(new SingleIntersector(annotation));
      }
    }
  }

  /**
   * Record the new intersector as a neighbour of the already registered ones
   * it overlaps, and then register it.
   * @param {SingleIntersector} candidate
   */
  #register(candidate) {
    const registry = this.#intersectors;
    for (const [existing, neighbours] of registry) {
      if (!existing.overlaps(candidate)) {
        continue;
      }
      if (neighbours) {
        neighbours.add(candidate);
      } else {
        registry.set(existing, new Set([candidate]));
      }
    }
    registry.set(candidate, null);
  }

  /**
   * Offer a glyph to the intersectors. Its center is computed from the
   * translation part of the transform plus half of the given dimensions.
   * Once an intersector with overlapping neighbours accepted the glyph, only
   * these neighbours are offered the glyph; the other remaining intersectors
   * just drop their pending extra chars.
   * @param {Array<number>} transform
   * @param {number} width
   * @param {number} height
   * @param {string} glyph
   */
  addGlyph(transform, width, height, glyph) {
    const centerX = transform[4] + width / 2;
    const centerY = transform[5] + height / 2;

    let candidates = null;
    for (const [current, neighbours] of this.#intersectors) {
      if (!candidates) {
        if (current.addGlyph(centerX, centerY, glyph)) {
          candidates = neighbours;
        }
      } else if (candidates.has(current)) {
        current.addGlyph(centerX, centerY, glyph);
      } else {
        current.disableExtraChars();
      }
    }
  }

  /**
   * Forward a char which doesn't come from a glyph to every intersector.
   * @param {string} char
   */
  addExtraChar(char) {
    this.#intersectors.forEach((_neighbours, current) => {
      current.addExtraChar(char);
    });
  }

  /**
   * Ask every intersector to write its text on its annotation.
   */
  setText() {
    this.#intersectors.forEach((_neighbours, current) => {
      current.setText();
    });
  }
}
export { Intersector };

View File

@ -555,6 +555,7 @@ class AnnotationElement {
svg.classList.add("quadrilateralsContainer");
svg.setAttribute("width", 0);
svg.setAttribute("height", 0);
svg.role = "none";
const defs = svgFactory.createElement("defs");
svg.append(defs);
const clipPath = svgFactory.createElement("clipPath");
@ -2912,13 +2913,23 @@ class HighlightAnnotationElement extends AnnotationElement {
}
render() {
if (!this.data.popupRef && this.hasPopupData) {
const {
data: { overlaidText, popupRef },
} = this;
if (!popupRef && this.hasPopupData) {
this._createPopup();
}
this.container.classList.add("highlightAnnotation");
this._editOnDoubleClick();
if (overlaidText) {
const mark = document.createElement("mark");
mark.classList.add("overlaidText");
mark.textContent = overlaidText;
this.container.append(mark);
}
return this.container;
}
}
@ -2933,11 +2944,22 @@ class UnderlineAnnotationElement extends AnnotationElement {
}
render() {
if (!this.data.popupRef && this.hasPopupData) {
const {
data: { overlaidText, popupRef },
} = this;
if (!popupRef && this.hasPopupData) {
this._createPopup();
}
this.container.classList.add("underlineAnnotation");
if (overlaidText) {
const underline = document.createElement("u");
underline.classList.add("overlaidText");
underline.textContent = overlaidText;
this.container.append(underline);
}
return this.container;
}
}
@ -2952,11 +2974,22 @@ class SquigglyAnnotationElement extends AnnotationElement {
}
render() {
if (!this.data.popupRef && this.hasPopupData) {
const {
data: { overlaidText, popupRef },
} = this;
if (!popupRef && this.hasPopupData) {
this._createPopup();
}
this.container.classList.add("squigglyAnnotation");
if (overlaidText) {
const underline = document.createElement("u");
underline.classList.add("overlaidText");
underline.textContent = overlaidText;
this.container.append(underline);
}
return this.container;
}
}
@ -2971,11 +3004,22 @@ class StrikeOutAnnotationElement extends AnnotationElement {
}
render() {
if (!this.data.popupRef && this.hasPopupData) {
const {
data: { overlaidText, popupRef },
} = this;
if (!popupRef && this.hasPopupData) {
this._createPopup();
}
this.container.classList.add("strikeoutAnnotation");
if (overlaidText) {
const strikeout = document.createElement("s");
strikeout.classList.add("overlaidText");
strikeout.textContent = overlaidText;
this.container.append(strikeout);
}
return this.container;
}
}

View File

@ -696,4 +696,77 @@ describe("ResetForm action", () => {
});
});
});
describe("Text under some annotations", () => {
  describe("bug1885505.pdf", () => {
    let pages;

    // In every browser, read the text content of the element matching the
    // selector and compare it with the expected string.
    const expectTextContent = (selector, expected) =>
      Promise.all(
        pages.map(async ([browserName, page]) => {
          const text = await page.$eval(selector, el => el.textContent);
          expect(text).withContext(`In ${browserName}`).toEqual(expected);
        })
      );

    beforeEach(async () => {
      // Wait until one of the annotations used below is in the DOM.
      const selectors = [56, 58, 60, 65].map(
        id => `[data-annotation-id='${id}R']`
      );
      pages = await loadAndWait(
        "bug1885505.pdf",
        `:is(${selectors.join(", ")})`
      );
    });

    afterEach(async () => {
      await closePages(pages);
    });

    it("must check that the text under a highlight annotation exist in the DOM", async () => {
      await expectTextContent("[data-annotation-id='56R'] mark", "Languages");
    });

    it("must check that the text under an underline annotation exist in the DOM", async () => {
      await expectTextContent("[data-annotation-id='58R'] u", "machine");
    });

    it("must check that the text under a squiggly annotation exist in the DOM", async () => {
      await expectTextContent(
        "[data-annotation-id='60R'] u",
        `paths through nested loops. We have implemented
a dynamic compiler for JavaScript based on our`
      );
    });

    it("must check that the text under a strikeout annotation exist in the DOM", async () => {
      await expectTextContent(
        "[data-annotation-id='65R'] s",
        "Experimentation,"
      );
    });
  });
});
});

View File

@ -727,3 +727,4 @@
!bug1963407.pdf
!issue19517.pdf
!empty#hash.pdf
!bug1885505.pdf

BIN
test/pdfs/bug1885505.pdf Executable file

Binary file not shown.

View File

@ -5127,4 +5127,47 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
}
);
});
describe("Annotations", function () {
  it("should extract the text under some annotations", async function () {
    // Annotation id -> text expected in `overlaidText`. The line breaks in
    // the multi-line strings are part of the expected text.
    const expectedTexts = [
      ["56R", "Languages"],
      [
        "52R",
        `Dynamic languages such as JavaScript are more difficult to com-
pile than statically typed ones. Since no concrete type information
is available, traditional compilers`,
      ],
      [
        "54R",
        `typed ones. Since no concrete type information
is available, traditional compilers need to emit generic code that can
handle all possible type combinations at runtime. We present an al-
ternative compilation technique for dynamically-`,
      ],
      ["58R", "machine"],
      [
        "60R",
        `paths through nested loops. We have implemented
a dynamic compiler for JavaScript based on our`,
      ],
      ["65R", "Experimentation,"],
      [
        "63R",
        `languages such as JavaScript, Python, and Ruby, are pop-
ular since they are expressive, accessible to non-experts, and make
deployment as easy as distributing a source file. They are used for
small scripts as well as for`,
      ],
    ];

    const loadingTask = getDocument(buildGetDocumentParams("bug1885505.pdf"));
    try {
      const pdfDoc = await loadingTask.promise;
      const page1 = await pdfDoc.getPage(1);
      const annots = await page1.getAnnotations();

      for (const [id, text] of expectedTexts) {
        const annot = annots.find(x => x.id === id);
        expect(annot.overlaidText).toEqual(text);
      }
    } finally {
      // Release the document (and its worker) even if something threw above,
      // in order to not leak it into the next specs.
      await loadingTask.destroy();
    }
  });
});
});

View File

@ -126,6 +126,16 @@
display: none;
}
}
/* Element holding the text found under a markup annotation. It's pinned to
   the top-left corner of its container with a zero-sized, clipped box, so
   nothing is painted while the text stays in the DOM (the commit message
   states it's meant to be read by screen readers). */
.overlaidText {
position: absolute;
top: 0;
left: 0;
width: 0;
height: 0;
display: inline-block;
overflow: hidden;
}
}
.textLayer.selecting ~ & section {