Merge pull request #17941 from Snuffleupagus/getTextContent-lang

[api-minor] Include the document /Lang attribute in the textContent-data
This commit is contained in:
Jonas Jenwald 2024-05-14 13:57:46 +02:00 committed by GitHub
commit bb9bb34721
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 56 additions and 41 deletions

View File

@ -166,7 +166,7 @@ class Catalog {
return shadow( return shadow(
this, this,
"lang", "lang",
typeof lang === "string" ? stringToPDFString(lang) : null lang && typeof lang === "string" ? stringToPDFString(lang) : null
); );
} }

View File

@ -395,10 +395,9 @@ class Page {
} }
loadResources(keys) { loadResources(keys) {
if (!this.resourcesPromise) { // TODO: add async `_getInheritableProperty` and remove this.
// TODO: add async `_getInheritableProperty` and remove this. this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
}
return this.resourcesPromise.then(() => { return this.resourcesPromise.then(() => {
const objectLoader = new ObjectLoader(this.resources, keys, this.xref); const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
return objectLoader.load(); return objectLoader.load();
@ -625,7 +624,7 @@ class Page {
}); });
} }
extractTextContent({ async extractTextContent({
handler, handler,
task, task,
includeMarkedContent, includeMarkedContent,
@ -639,31 +638,35 @@ class Page {
"Properties", "Properties",
"XObject", "XObject",
]); ]);
const langPromise = this.pdfManager.ensureCatalog("lang");
const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); const [contentStream, , lang] = await Promise.all([
return dataPromises.then(([contentStream]) => { contentStreamPromise,
const partialEvaluator = new PartialEvaluator({ resourcesPromise,
xref: this.xref, langPromise,
handler, ]);
pageIndex: this.pageIndex, const partialEvaluator = new PartialEvaluator({
idFactory: this._localIdFactory, xref: this.xref,
fontCache: this.fontCache, handler,
builtInCMapCache: this.builtInCMapCache, pageIndex: this.pageIndex,
standardFontDataCache: this.standardFontDataCache, idFactory: this._localIdFactory,
globalImageCache: this.globalImageCache, fontCache: this.fontCache,
systemFontCache: this.systemFontCache, builtInCMapCache: this.builtInCMapCache,
options: this.evaluatorOptions, standardFontDataCache: this.standardFontDataCache,
}); globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
return partialEvaluator.getTextContent({ return partialEvaluator.getTextContent({
stream: contentStream, stream: contentStream,
task, task,
resources: this.resources, resources: this.resources,
includeMarkedContent, includeMarkedContent,
disableNormalization, disableNormalization,
sink, sink,
viewBox: this.view, viewBox: this.view,
}); lang,
}); });
} }

View File

@ -2307,6 +2307,7 @@ class PartialEvaluator {
sink, sink,
seenStyles = new Set(), seenStyles = new Set(),
viewBox, viewBox,
lang = null,
markedContentData = null, markedContentData = null,
disableNormalization = false, disableNormalization = false,
keepWhiteSpace = false, keepWhiteSpace = false,
@ -2323,6 +2324,7 @@ class PartialEvaluator {
const textContent = { const textContent = {
items: [], items: [],
styles: Object.create(null), styles: Object.create(null),
lang,
}; };
const textContentItem = { const textContentItem = {
initialized: false, initialized: false,
@ -3296,6 +3298,7 @@ class PartialEvaluator {
sink: sinkWrapper, sink: sinkWrapper,
seenStyles, seenStyles,
viewBox, viewBox,
lang,
markedContentData, markedContentData,
disableNormalization, disableNormalization,
keepWhiteSpace, keepWhiteSpace,

View File

@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
* items are included when includeMarkedContent is true. * items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects, * @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name. * indexed by font name.
* @property {string | null} lang - The document /Lang attribute.
*/ */
/** /**
@ -1677,6 +1678,7 @@ class PDFPageProxy {
resolve(textContent); resolve(textContent);
return; return;
} }
textContent.lang ??= value.lang;
Object.assign(textContent.styles, value.styles); Object.assign(textContent.styles, value.styles);
textContent.items.push(...value.items); textContent.items.push(...value.items);
pump(); pump();
@ -1687,6 +1689,7 @@ class PDFPageProxy {
const textContent = { const textContent = {
items: [], items: [],
styles: Object.create(null), styles: Object.create(null),
lang: null,
}; };
pump(); pump();
}); });

View File

@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
const ascentCache = new Map(); const ascentCache = new Map();
let _canvasContext = null; let _canvasContext = null;
function getCtx() { function getCtx(lang = null) {
if (!_canvasContext) { if (!_canvasContext) {
// We don't use an OffscreenCanvas here because we use serif/sans serif // We don't use an OffscreenCanvas here because we use serif/sans serif
// fonts with it and they depend on the locale. // fonts with it and they depend on the locale.
@ -89,13 +89,13 @@ function cleanupTextLayer() {
_canvasContext = null; _canvasContext = null;
} }
function getAscent(fontFamily) { function getAscent(fontFamily, lang) {
const cachedAscent = ascentCache.get(fontFamily); const cachedAscent = ascentCache.get(fontFamily);
if (cachedAscent) { if (cachedAscent) {
return cachedAscent; return cachedAscent;
} }
const ctx = getCtx(); const ctx = getCtx(lang);
const savedFont = ctx.font; const savedFont = ctx.font;
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE; ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
@ -162,7 +162,7 @@ function getAscent(fontFamily) {
return DEFAULT_FONT_ASCENT; return DEFAULT_FONT_ASCENT;
} }
function appendText(task, geom) { function appendText(task, geom, lang) {
// Initialize all used properties to keep the caches monomorphic. // Initialize all used properties to keep the caches monomorphic.
const textDiv = document.createElement("span"); const textDiv = document.createElement("span");
const textDivProperties = { const textDivProperties = {
@ -184,7 +184,7 @@ function appendText(task, geom) {
const fontFamily = const fontFamily =
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily; (task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
const fontHeight = Math.hypot(tx[2], tx[3]); const fontHeight = Math.hypot(tx[2], tx[3]);
const fontAscent = fontHeight * getAscent(fontFamily); const fontAscent = fontHeight * getAscent(fontFamily, lang);
let left, top; let left, top;
if (angle === 0) { if (angle === 0) {
@ -324,7 +324,7 @@ class TextLayerRenderTask {
div: null, div: null,
scale: viewport.scale * (globalThis.devicePixelRatio || 1), scale: viewport.scale * (globalThis.devicePixelRatio || 1),
properties: null, properties: null,
ctx: getCtx(), ctx: null,
}; };
this._styleCache = Object.create(null); this._styleCache = Object.create(null);
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims; const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
@ -371,7 +371,11 @@ class TextLayerRenderTask {
/** /**
* @private * @private
*/ */
_processItems(items) { _processItems(items, lang) {
if (!this._layoutTextParams.ctx) {
this._textDivProperties.set(this._rootContainer, { lang });
this._layoutTextParams.ctx = getCtx(lang);
}
const textDivs = this._textDivs, const textDivs = this._textDivs,
textContentItemsStr = this._textContentItemsStr; textContentItemsStr = this._textContentItemsStr;
@ -403,7 +407,7 @@ class TextLayerRenderTask {
continue; continue;
} }
textContentItemsStr.push(item.str); textContentItemsStr.push(item.str);
appendText(this, item); appendText(this, item, lang);
} }
} }
@ -440,7 +444,7 @@ class TextLayerRenderTask {
} }
Object.assign(styleCache, value.styles); Object.assign(styleCache, value.styles);
this._processItems(value.items); this._processItems(value.items, value.lang);
pump(); pump();
}, this._capability.reject); }, this._capability.reject);
}; };
@ -476,7 +480,7 @@ function updateTextLayer({
} }
if (mustRescale) { if (mustRescale) {
const ctx = getCtx(); const ctx = getCtx(textDivProperties.get(container)?.lang);
const scale = viewport.scale * (globalThis.devicePixelRatio || 1); const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
const params = { const params = {
prevFontSize: null, prevFontSize: null,

View File

@ -3128,10 +3128,11 @@ describe("api", function () {
}); });
it("gets text content", async function () { it("gets text content", async function () {
const { items, styles } = await page.getTextContent(); const { items, styles, lang } = await page.getTextContent();
expect(items.length).toEqual(15); expect(items.length).toEqual(15);
expect(objectSize(styles)).toEqual(5); expect(objectSize(styles)).toEqual(5);
expect(lang).toEqual("en");
const text = mergeText(items); const text = mergeText(items);
expect(text).toEqual(`Table Of Content expect(text).toEqual(`Table Of Content
@ -3146,13 +3147,14 @@ page 1 / 3`);
); );
const pdfDoc = await loadingTask.promise; const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1); const pdfPage = await pdfDoc.getPage(1);
const { items, styles } = await pdfPage.getTextContent({ const { items, styles, lang } = await pdfPage.getTextContent({
disableNormalization: true, disableNormalization: true,
}); });
expect(items.length).toEqual(1); expect(items.length).toEqual(1);
// Font name will be a random object id. // Font name will be a random object id.
const fontName = items[0].fontName; const fontName = items[0].fontName;
expect(Object.keys(styles)).toEqual([fontName]); expect(Object.keys(styles)).toEqual([fontName]);
expect(lang).toEqual(null);
expect(items[0]).toEqual({ expect(items[0]).toEqual({
dir: "ltr", dir: "ltr",