Add text extractor for an external service

This commit is contained in:
Greg Tatum 2025-10-21 11:41:04 -05:00
parent f6317ddbbb
commit 26360c3e63
4 changed files with 103 additions and 0 deletions

View File

@ -89,6 +89,7 @@ import { PDFPrintServiceFactory } from "web-print_service";
import { PDFRenderingQueue } from "./pdf_rendering_queue.js"; import { PDFRenderingQueue } from "./pdf_rendering_queue.js";
import { PDFScriptingManager } from "./pdf_scripting_manager.js"; import { PDFScriptingManager } from "./pdf_scripting_manager.js";
import { PDFSidebar } from "web-pdf_sidebar"; import { PDFSidebar } from "web-pdf_sidebar";
import { PdfTextExtractor } from "./pdf_text_extractor.js";
import { PDFThumbnailViewer } from "web-pdf_thumbnail_viewer"; import { PDFThumbnailViewer } from "web-pdf_thumbnail_viewer";
import { PDFViewer } from "./pdf_viewer.js"; import { PDFViewer } from "./pdf_viewer.js";
import { Preferences } from "web-preferences"; import { Preferences } from "web-preferences";
@ -129,6 +130,8 @@ const PDFViewerApplication = {
pdfDocumentProperties: null, pdfDocumentProperties: null,
/** @type {PDFLinkService} */ /** @type {PDFLinkService} */
pdfLinkService: null, pdfLinkService: null,
/** @type {PdfTextExtractor|null} */
pdfTextExtractor: null,
/** @type {PDFHistory} */ /** @type {PDFHistory} */
pdfHistory: null, pdfHistory: null,
/** @type {PDFSidebar} */ /** @type {PDFSidebar} */
@ -262,6 +265,8 @@ const PDFViewerApplication = {
} }
await this._initializeViewerComponents(); await this._initializeViewerComponents();
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
// Bind the various event handlers *after* the viewer has been // Bind the various event handlers *after* the viewer has been
// initialized, to prevent errors if an event arrives too soon. // initialized, to prevent errors if an event arrives too soon.
this.bindEvents(); this.bindEvents();
@ -1144,6 +1149,7 @@ const PDFViewerApplication = {
this.pdfViewer.setDocument(null); this.pdfViewer.setDocument(null);
this.pdfLinkService.setDocument(null); this.pdfLinkService.setDocument(null);
this.pdfDocumentProperties?.setDocument(null); this.pdfDocumentProperties?.setDocument(null);
this.pdfTextExtractor?.setDocument(null);
} }
this.pdfLinkService.externalLinkEnabled = true; this.pdfLinkService.externalLinkEnabled = true;
this.store = null; this.store = null;
@ -1450,6 +1456,7 @@ const PDFViewerApplication = {
const pdfViewer = this.pdfViewer; const pdfViewer = this.pdfViewer;
pdfViewer.setDocument(pdfDocument); pdfViewer.setDocument(pdfDocument);
this.pdfTextExtractor.setViewer(pdfViewer);
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer; const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
this.pdfThumbnailViewer?.setDocument(pdfDocument); this.pdfThumbnailViewer?.setDocument(pdfDocument);

View File

@ -33,6 +33,8 @@ class BaseExternalServices {
reportTelemetry(data) {} reportTelemetry(data) {}
/**
 * Hook for handing extracted PDF text to the embedding environment.
 * The base implementation intentionally does nothing; subclasses override it.
 *
 * @param {Object} data - The payload to report; `PdfTextExtractor` passes an
 *   object with `text` and `requestId` properties.
 */
reportText(data) {}
/** /**
* @returns {Promise<IL10n>} * @returns {Promise<IL10n>}
*/ */

View File

@ -645,6 +645,10 @@ class ExternalServices extends BaseExternalServices {
FirefoxCom.request("reportTelemetry", data); FirefoxCom.request("reportTelemetry", data);
} }
/**
 * Forwards extracted PDF text by issuing a "reportText" request through
 * `FirefoxCom`.
 *
 * @param {Object} data - The payload to forward; it is passed through
 *   unchanged.
 */
reportText(data) {
FirefoxCom.request("reportText", data);
}
updateEditorStates(data) { updateEditorStates(data) {
FirefoxCom.request("updateEditorStates", data); FirefoxCom.request("updateEditorStates", data);
} }

90
web/pdf_text_extractor.js Normal file
View File

@ -0,0 +1,90 @@
/* Copyright 2024 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Extracts the text content of the currently open PDF and passes it back to
 * the external service, in response to "requestTextContent" events that are
 * dispatched on `window`.
 *
 * Requests that arrive while no viewer is attached are queued and replayed
 * as soon as `setViewer` provides one.
 */
class PdfTextExtractor {
  /**
   * How long (in milliseconds) successfully extracted text stays cached, so
   * that several consumers asking in quick succession share one extraction.
   */
  static #CACHE_DURATION_MS = 5000;

  /** @type {PDFViewer | null} */
  #pdfViewer = null;

  #externalServices;

  /**
   * The in-flight or recently resolved text extraction, if any.
   *
   * @type {?Promise<string>}
   */
  #textPromise = null;

  /** Request ids received while no viewer was attached. */
  #pendingRequests = new Set();

  /**
   * @param {Object} externalServices - Must provide a `reportText(data)`
   *   method; it receives `{ text, requestId }` for every answered request.
   */
  constructor(externalServices) {
    this.#externalServices = externalServices;

    window.addEventListener("requestTextContent", ({ detail }) => {
      this.#handleRequest(detail.requestId);
    });
  }

  /**
   * The PDF viewer is required to get the page text.
   *
   * @param {PDFViewer | null} pdfViewer
   */
  setViewer(pdfViewer) {
    this.#pdfViewer = pdfViewer;

    if (!this.#pdfViewer || this.#pendingRequests.size === 0) {
      return;
    }
    // Handle any pending requests that came in while things were loading.
    // Snapshot and empty the queue first, so that it is never mutated while
    // it is being iterated.
    const pendingRequests = [...this.#pendingRequests];
    this.#pendingRequests.clear();
    for (const pendingRequest of pendingRequests) {
      this.#handleRequest(pendingRequest);
    }
  }

  /**
   * Notifies the extractor that the active document changed.
   *
   * Text cached for the previous document is dropped, so that it can never be
   * reported for another document. When the document is closed (i.e. `null`
   * is passed) the viewer is detached as well, which means that new requests
   * are queued until `setViewer` is called for the next document.
   *
   * @param {Object | null} pdfDocument
   */
  setDocument(pdfDocument) {
    this.#textPromise = null;

    if (!pdfDocument) {
      this.#pdfViewer = null;
    }
  }

  /**
   * Builds up all of the text from a PDF and reports it, together with the
   * `requestId`, to the external service. If no viewer is attached yet the
   * request is queued instead.
   *
   * @param {number} requestId
   * @returns {Promise<void>} Rejects if the text extraction fails; in that
   *   case nothing is reported and the next request retries the extraction.
   */
  async extractTextContent(requestId) {
    if (!this.#pdfViewer) {
      this.#pendingRequests.add(requestId);
      return;
    }

    if (!this.#textPromise) {
      const textPromise = this.#pdfViewer.getAllText();
      this.#textPromise = textPromise;

      // Only invalidate the cache entry created here, never a newer one.
      const invalidate = () => {
        if (this.#textPromise === textPromise) {
          this.#textPromise = null;
        }
      };
      textPromise.then(
        // After the text resolves, cache the text for a little bit in case
        // multiple consumers call it.
        () => {
          setTimeout(invalidate, PdfTextExtractor.#CACHE_DURATION_MS);
        },
        // Never keep a failed extraction around, since that would make every
        // subsequent request fail as well.
        invalidate
      );
    }

    const text = await this.#textPromise;
    this.#externalServices.reportText({ text, requestId });
  }

  /**
   * Fire-and-forget wrapper around `extractTextContent`, used where nobody
   * awaits the result, which ensures that failures are logged rather than
   * surfacing as unhandled promise rejections.
   *
   * @param {number} requestId
   */
  #handleRequest(requestId) {
    this.extractTextContent(requestId).catch(reason => {
      console.error("PdfTextExtractor: unable to extract the text.", reason);
    });
  }
}
export { PdfTextExtractor };