diff --git a/src/core/annotation.js b/src/core/annotation.js index f5fe324d8..f5fba7225 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -1247,6 +1247,10 @@ class Annotation { return null; } + get overlaysTextContent() { + return false; + } + get hasTextContent() { return false; } @@ -4711,6 +4715,10 @@ class HighlightAnnotation extends MarkupAnnotation { } } + get overlaysTextContent() { + return true; + } + static createNewDict(annotation, xref, { apRef, ap }) { const { color, oldAnnotation, opacity, rect, rotation, user, quadPoints } = annotation; @@ -4835,6 +4843,10 @@ class UnderlineAnnotation extends MarkupAnnotation { this.data.popupRef = null; } } + + get overlaysTextContent() { + return true; + } } class SquigglyAnnotation extends MarkupAnnotation { @@ -4879,6 +4891,10 @@ class SquigglyAnnotation extends MarkupAnnotation { this.data.popupRef = null; } } + + get overlaysTextContent() { + return true; + } } class StrikeOutAnnotation extends MarkupAnnotation { @@ -4918,6 +4934,10 @@ class StrikeOutAnnotation extends MarkupAnnotation { this.data.popupRef = null; } } + + get overlaysTextContent() { + return true; + } } class StampAnnotation extends MarkupAnnotation { diff --git a/src/core/document.js b/src/core/document.js index 46c20bdf5..b26acd3dd 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -66,6 +66,7 @@ import { calculateMD5 } from "./calculate_md5.js"; import { Catalog } from "./catalog.js"; import { clearGlobalCaches } from "./cleanup_helper.js"; import { DatasetReader } from "./dataset_reader.js"; +import { Intersector } from "./intersector.js"; import { Linearization } from "./parser.js"; import { NullStream } from "./stream.js"; import { ObjectLoader } from "./object_loader.js"; @@ -632,6 +633,7 @@ class Page { includeMarkedContent, disableNormalization, sink, + intersector = null, }) { const contentStreamPromise = this.getContentStream(); const resourcesPromise = this.loadResources(RESOURCES_KEYS_TEXT_CONTENT); 
@@ -658,6 +660,7 @@ class Page { sink, viewBox: this.view, lang, + intersector, }); } @@ -707,6 +710,8 @@ class Page { intentDisplay = !!(intent & RenderingIntentFlag.DISPLAY), intentPrint = !!(intent & RenderingIntentFlag.PRINT); + const highlightedAnnotations = []; + for (const annotation of annotations) { // Get the annotation even if it's hidden because // JS can change its display. @@ -732,9 +737,29 @@ class Page { ); }) ); + } else if (annotation.overlaysTextContent && isVisible) { + highlightedAnnotations.push(annotation); } } + if (highlightedAnnotations.length > 0) { + const intersector = new Intersector(highlightedAnnotations); + textContentPromises.push( + this.extractTextContent({ + handler, + task, + includeMarkedContent: false, + disableNormalization: false, + sink: null, + viewBox: this.view, + lang: null, + intersector, + }).then(() => { + intersector.setText(); + }) + ); + } + await Promise.all(textContentPromises); return annotationsData; } diff --git a/src/core/evaluator.js b/src/core/evaluator.js index edf02c2c0..069c2d38b 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2362,6 +2362,7 @@ class PartialEvaluator { disableNormalization = false, keepWhiteSpace = false, prevRefs = null, + intersector = null, }) { const objId = stream.dict?.objId; const seenRefs = new RefSet(prevRefs); @@ -2506,6 +2507,7 @@ class PartialEvaluator { transform = textContentItem.prevTransform, fontName = textContentItem.fontName, }) { + intersector?.addExtraChar(" "); textContent.items.push({ str: " ", dir: "ltr", @@ -2964,9 +2966,21 @@ class PartialEvaluator { if (!font.vertical) { scaledDim *= textState.textHScale; + intersector?.addGlyph( + getCurrentTextTransform(), + scaledDim, + 0, + glyph.unicode + ); textState.translateTextMatrix(scaledDim, 0); textChunk.width += scaledDim; } else { + intersector?.addGlyph( + getCurrentTextTransform(), + 0, + scaledDim, + glyph.unicode + ); textState.translateTextMatrix(0, scaledDim); scaledDim = 
Math.abs(scaledDim); textChunk.height += scaledDim; @@ -2985,8 +2999,12 @@ class PartialEvaluator { // alignment issues between the textLayer and the canvas if the text // contains e.g. tabs (fixes issue6612.pdf). textChunk.str.push(" "); + intersector?.addExtraChar(" "); + } + + if (!intersector) { + textChunk.str.push(glyphUnicode); } - textChunk.str.push(glyphUnicode); if (charSpacing) { if (!font.vertical) { @@ -3002,6 +3020,7 @@ class PartialEvaluator { } function appendEOL() { + intersector?.addExtraChar("\n"); resetLastChars(); if (textContentItem.initialized) { textContentItem.hasEOL = true; @@ -3027,6 +3046,7 @@ class PartialEvaluator { if (textContentItem.initialized) { resetLastChars(); textContentItem.str.push(" "); + intersector?.addExtraChar(" "); } return false; } @@ -3078,7 +3098,7 @@ class PartialEvaluator { if (batch && length < TEXT_CHUNK_BATCH_SIZE) { return; } - sink.enqueue(textContent, length); + sink?.enqueue(textContent, length); textContent.items = []; textContent.styles = Object.create(null); } @@ -3088,7 +3108,7 @@ class PartialEvaluator { return new Promise(function promiseBody(resolve, reject) { const next = function (promise) { enqueueChunk(/* batch = */ true); - Promise.all([promise, sink.ready]).then(function () { + Promise.all([promise, sink?.ready]).then(function () { try { promiseBody(resolve, reject); } catch (ex) { @@ -3341,7 +3361,7 @@ class PartialEvaluator { }, get desiredSize() { - return sink.desiredSize; + return sink.desiredSize ?? 0; }, get ready() { @@ -3359,7 +3379,7 @@ class PartialEvaluator { : resources, stateManager: xObjStateManager, includeMarkedContent, - sink: sinkWrapper, + sink: sink && sinkWrapper, seenStyles, viewBox, lang, @@ -3499,7 +3519,7 @@ class PartialEvaluator { } break; } // switch - if (textContent.items.length >= sink.desiredSize) { + if (textContent.items.length >= (sink?.desiredSize ?? 1)) { // Wait for ready, if we reach highWaterMark. 
/* Copyright 2025 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Check if a point lies strictly inside one of the boxes stored in a flat
 * `[minX, minY, maxX, maxY, ...]` array.
 * @param {Array<number>} boxes
 * @param {number} i - Index of the first coordinate of the box to test.
 * @param {number} x
 * @param {number} y
 * @returns {boolean}
 */
function isInsideBox(boxes, i, x, y) {
  return !(
    boxes[i] >= x ||
    boxes[i + 2] <= x ||
    boxes[i + 1] >= y ||
    boxes[i + 3] <= y
  );
}

/**
 * Collects the glyphs whose center falls inside the quadrilaterals of one
 * annotation, and stores the resulting string on the annotation data.
 */
class SingleIntersector {
  #annotation;

  #minX = Infinity;

  #minY = Infinity;

  #maxX = -Infinity;

  #maxY = -Infinity;

  // Axis-aligned extents of every quad, flattened as
  // [minX, minY, maxX, maxY, ...].
  #quadBoxes = [];

  #text = [];

  // Separators (spaces, EOLs) seen since the last intersecting glyph; they are
  // only committed to the text once another intersecting glyph follows.
  #extraChars = [];

  #lastIntersectingQuadIndex = -1;

  #canTakeExtraChars = false;

  /**
   * @param {Object} annotation - Must expose `data.quadPoints`, a flat list
   *   of 8 numbers (4 corners) per quadrilateral.
   */
  constructor(annotation) {
    this.#annotation = annotation;
    const quadPoints = annotation.data.quadPoints;
    const boxes = this.#quadBoxes;

    // The extents are computed from the four corners of each quad, hence the
    // result doesn't depend on the order in which the corners are listed.
    // A trailing incomplete quad (fewer than 8 numbers) is ignored.
    for (let i = 0, ii = quadPoints.length - 7; i < ii; i += 8) {
      const x1 = quadPoints[i],
        x2 = quadPoints[i + 2],
        x3 = quadPoints[i + 4],
        x4 = quadPoints[i + 6];
      const y1 = quadPoints[i + 1],
        y2 = quadPoints[i + 3],
        y3 = quadPoints[i + 5],
        y4 = quadPoints[i + 7];
      const minX = Math.min(x1, x2, x3, x4);
      const minY = Math.min(y1, y2, y3, y4);
      const maxX = Math.max(x1, x2, x3, x4);
      const maxY = Math.max(y1, y2, y3, y4);
      boxes.push(minX, minY, maxX, maxY);

      this.#minX = Math.min(this.#minX, minX);
      this.#minY = Math.min(this.#minY, minY);
      this.#maxX = Math.max(this.#maxX, maxX);
      this.#maxY = Math.max(this.#maxY, maxY);
    }
  }

  /**
   * Check if the global bounding boxes of the two intersectors overlap.
   * Boxes which only share an edge aren't considered as overlapping.
   * @param {SingleIntersector} other
   * @returns {boolean}
   */
  overlaps(other) {
    return !(
      this.#minX >= other.#maxX ||
      this.#maxX <= other.#minX ||
      this.#minY >= other.#maxY ||
      this.#maxY <= other.#minY
    );
  }

  /**
   * Check if the given point intersects with the annotation's quad points.
   * The point (x, y) is supposed to be the center of the glyph.
   * @param {number} x
   * @param {number} y
   * @returns {boolean}
   */
  #intersects(x, y) {
    if (
      this.#minX >= x ||
      this.#maxX <= x ||
      this.#minY >= y ||
      this.#maxY <= y
    ) {
      return false;
    }

    const boxes = this.#quadBoxes;
    if (boxes.length === 4) {
      // We've only one quad, so if we intersect min/max bounds then we
      // intersect the quad.
      return true;
    }

    // Consecutive glyphs are usually in the same quad, hence try the last
    // matching one before scanning all of them.
    const last = this.#lastIntersectingQuadIndex;
    if (last >= 0) {
      if (isInsideBox(boxes, last, x, y)) {
        return true;
      }
      this.#lastIntersectingQuadIndex = -1;
    }

    for (let i = 0, ii = boxes.length; i < ii; i += 4) {
      if (isInsideBox(boxes, i, x, y)) {
        this.#lastIntersectingQuadIndex = i;
        return true;
      }
    }
    return false;
  }

  /**
   * Add the glyph to the text if its center is inside one of the quads.
   * @param {number} x
   * @param {number} y
   * @param {string} glyph
   * @returns {boolean} true if the glyph has been added.
   */
  addGlyph(x, y, glyph) {
    if (!this.#intersects(x, y)) {
      this.disableExtraChars();
      return false;
    }

    if (this.#extraChars.length > 0) {
      this.#text.push(this.#extraChars.join(""));
      this.#extraChars.length = 0;
    }
    this.#text.push(glyph);
    this.#canTakeExtraChars = true;

    return true;
  }

  /**
   * Queue a separator; it's ignored unless the previous glyph was added.
   * @param {string} char
   */
  addExtraChar(char) {
    if (this.#canTakeExtraChars) {
      this.#extraChars.push(char);
    }
  }

  /**
   * Stop accepting separators and drop the ones which are still pending.
   */
  disableExtraChars() {
    if (!this.#canTakeExtraChars) {
      return;
    }
    this.#canTakeExtraChars = false;
    this.#extraChars.length = 0;
  }

  /**
   * Write the collected text in `annotation.data.overlaidText`.
   */
  setText() {
    this.#annotation.data.overlaidText = this.#text.join("");
  }
}

/**
 * Dispatches the glyphs of a page to one SingleIntersector per annotation
 * having some quad points.
 */
class Intersector {
  // Maps each intersector to the set of the intersectors which have been
  // added after it and which overlap it (or null when there are none).
  #intersectors = new Map();

  /**
   * @param {Iterable<Object>} annotations - The ones without
   *   `data.quadPoints` are skipped.
   */
  constructor(annotations) {
    for (const annotation of annotations) {
      if (!annotation.data.quadPoints) {
        continue;
      }
      const intersector = new SingleIntersector(annotation);
      for (const [otherIntersector, overlapping] of this.#intersectors) {
        if (otherIntersector.overlaps(intersector)) {
          if (!overlapping) {
            this.#intersectors.set(otherIntersector, new Set([intersector]));
          } else {
            overlapping.add(intersector);
          }
        }
      }
      this.#intersectors.set(intersector, null);
    }
  }

  /**
   * Offer a glyph to the annotations.
   * The tested point is `transform[4] + width / 2, transform[5] + height / 2`.
   * NOTE(review): width and height are added as is, without being mapped
   * through the transform; confirm that the caller provides them in the
   * coordinate space of the quad points.
   * @param {Array<number>} transform
   * @param {number} width
   * @param {number} height
   * @param {string} glyph
   */
  addGlyph(transform, width, height, glyph) {
    const x = transform[4] + width / 2;
    const y = transform[5] + height / 2;
    let overlappingIntersectors;
    for (const [intersector, overlapping] of this.#intersectors) {
      if (overlappingIntersectors) {
        // A previous intersector took the glyph: only the ones overlapping it
        // can take it too.
        if (overlappingIntersectors.has(intersector)) {
          intersector.addGlyph(x, y, glyph);
        } else {
          intersector.disableExtraChars();
        }
        continue;
      }
      if (!intersector.addGlyph(x, y, glyph)) {
        continue;
      }
      overlappingIntersectors = overlapping;
    }
  }

  /**
   * Forward a separator (space, EOL) to all the annotations.
   * @param {string} char
   */
  addExtraChar(char) {
    for (const intersector of this.#intersectors.keys()) {
      intersector.addExtraChar(char);
    }
  }

  /**
   * Store the collected text on each annotation.
   */
  setText() {
    for (const intersector of this.#intersectors.keys()) {
      intersector.setText();
    }
  }
}

export { Intersector };
UnderlineAnnotationElement extends AnnotationElement { } render() { - if (!this.data.popupRef && this.hasPopupData) { + const { + data: { overlaidText, popupRef }, + } = this; + if (!popupRef && this.hasPopupData) { this._createPopup(); } this.container.classList.add("underlineAnnotation"); + + if (overlaidText) { + const underline = document.createElement("u"); + underline.classList.add("overlaidText"); + underline.textContent = overlaidText; + this.container.append(underline); + } + return this.container; } } @@ -2952,11 +2974,22 @@ class SquigglyAnnotationElement extends AnnotationElement { } render() { - if (!this.data.popupRef && this.hasPopupData) { + const { + data: { overlaidText, popupRef }, + } = this; + if (!popupRef && this.hasPopupData) { this._createPopup(); } this.container.classList.add("squigglyAnnotation"); + + if (overlaidText) { + const underline = document.createElement("u"); + underline.classList.add("overlaidText"); + underline.textContent = overlaidText; + this.container.append(underline); + } + return this.container; } } @@ -2971,11 +3004,22 @@ class StrikeOutAnnotationElement extends AnnotationElement { } render() { - if (!this.data.popupRef && this.hasPopupData) { + const { + data: { overlaidText, popupRef }, + } = this; + if (!popupRef && this.hasPopupData) { this._createPopup(); } this.container.classList.add("strikeoutAnnotation"); + + if (overlaidText) { + const strikeout = document.createElement("s"); + strikeout.classList.add("overlaidText"); + strikeout.textContent = overlaidText; + this.container.append(strikeout); + } + return this.container; } } diff --git a/test/integration/annotation_spec.mjs b/test/integration/annotation_spec.mjs index 7db5284d2..e1c68c09b 100644 --- a/test/integration/annotation_spec.mjs +++ b/test/integration/annotation_spec.mjs @@ -696,4 +696,77 @@ describe("ResetForm action", () => { }); }); }); + + describe("Text under some annotations", () => { + describe("bug1885505.pdf", () => { + let pages; + + 
beforeEach(async () => { + pages = await loadAndWait( + "bug1885505.pdf", + ":is(" + + [56, 58, 60, 65] + .map(id => `[data-annotation-id='${id}R']`) + .join(", ") + + ")" + ); + }); + + afterEach(async () => { + await closePages(pages); + }); + + it("must check that the text under a highlight annotation exist in the DOM", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const text = await page.$eval( + "[data-annotation-id='56R'] mark", + el => el.textContent + ); + expect(text).withContext(`In ${browserName}`).toEqual("Languages"); + }) + ); + }); + + it("must check that the text under an underline annotation exist in the DOM", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const text = await page.$eval( + "[data-annotation-id='58R'] u", + el => el.textContent + ); + expect(text).withContext(`In ${browserName}`).toEqual("machine"); + }) + ); + }); + + it("must check that the text under a squiggly annotation exist in the DOM", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const text = await page.$eval( + "[data-annotation-id='60R'] u", + el => el.textContent + ); + expect(text).withContext(`In ${browserName}`) + .toEqual(`paths through nested loops. 
We have implemented +a dynamic compiler for JavaScript based on our`); + }) + ); + }); + + it("must check that the text under a strikeout annotation exist in the DOM", async () => { + await Promise.all( + pages.map(async ([browserName, page]) => { + const text = await page.$eval( + "[data-annotation-id='65R'] s", + el => el.textContent + ); + expect(text) + .withContext(`In ${browserName}`) + .toEqual("Experimentation,"); + }) + ); + }); + }); + }); }); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index cac180c7f..761ca05ae 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -727,3 +727,4 @@ !bug1963407.pdf !issue19517.pdf !empty#hash.pdf +!bug1885505.pdf diff --git a/test/pdfs/bug1885505.pdf b/test/pdfs/bug1885505.pdf new file mode 100755 index 000000000..69ec68aa5 Binary files /dev/null and b/test/pdfs/bug1885505.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 6610de8a7..f806c1048 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5127,4 +5127,47 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) } ); }); + + describe("Annotations", function () { + it("should extract the text under some annotations", async function () { + const loadingTask = getDocument(buildGetDocumentParams("bug1885505.pdf")); + const pdfDoc = await loadingTask.promise; + + const page1 = await pdfDoc.getPage(1); + const annots = await page1.getAnnotations(); + let annot = annots.find(x => x.id === "56R"); + expect(annot.overlaidText).toEqual("Languages"); + + annot = annots.find(x => x.id === "52R"); + expect(annot.overlaidText) + .toEqual(`Dynamic languages such as JavaScript are more difficult to com- +pile than statically typed ones. Since no concrete type information +is available, traditional compilers`); + + annot = annots.find(x => x.id === "54R"); + expect(annot.overlaidText) + .toEqual(`typed ones. 
Since no concrete type information +is available, traditional compilers need to emit generic code that can +handle all possible type combinations at runtime. We present an al- +ternative compilation technique for dynamically-`); + + annot = annots.find(x => x.id === "58R"); + expect(annot.overlaidText).toEqual("machine"); + + annot = annots.find(x => x.id === "60R"); + expect(annot.overlaidText) + .toEqual(`paths through nested loops. We have implemented +a dynamic compiler for JavaScript based on our`); + + annot = annots.find(x => x.id === "65R"); + expect(annot.overlaidText).toEqual("Experimentation,"); + + annot = annots.find(x => x.id === "63R"); + expect(annot.overlaidText) + .toEqual(`languages such as JavaScript, Python, and Ruby, are pop- +ular since they are expressive, accessible to non-experts, and make +deployment as easy as distributing a source file. They are used for +small scripts as well as for`); + }); + }); }); diff --git a/web/annotation_layer_builder.css b/web/annotation_layer_builder.css index f2dbc375e..cfee955e8 100644 --- a/web/annotation_layer_builder.css +++ b/web/annotation_layer_builder.css @@ -126,6 +126,16 @@ display: none; } } + + .overlaidText { + position: absolute; + top: 0; + left: 0; + width: 0; + height: 0; + display: inline-block; + overflow: hidden; + } } .textLayer.selecting ~ & section {