Merge pull request #17941 from Snuffleupagus/getTextContent-lang
[api-minor] Include the document /Lang attribute in the textContent-data
This commit is contained in:
commit
bb9bb34721
@ -166,7 +166,7 @@ class Catalog {
|
|||||||
return shadow(
|
return shadow(
|
||||||
this,
|
this,
|
||||||
"lang",
|
"lang",
|
||||||
typeof lang === "string" ? stringToPDFString(lang) : null
|
lang && typeof lang === "string" ? stringToPDFString(lang) : null
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -395,10 +395,9 @@ class Page {
|
|||||||
}
|
}
|
||||||
|
|
||||||
loadResources(keys) {
|
loadResources(keys) {
|
||||||
if (!this.resourcesPromise) {
|
// TODO: add async `_getInheritableProperty` and remove this.
|
||||||
// TODO: add async `_getInheritableProperty` and remove this.
|
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");
|
||||||
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
|
|
||||||
}
|
|
||||||
return this.resourcesPromise.then(() => {
|
return this.resourcesPromise.then(() => {
|
||||||
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
|
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
|
||||||
return objectLoader.load();
|
return objectLoader.load();
|
||||||
@ -625,7 +624,7 @@ class Page {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
extractTextContent({
|
async extractTextContent({
|
||||||
handler,
|
handler,
|
||||||
task,
|
task,
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
@ -639,31 +638,35 @@ class Page {
|
|||||||
"Properties",
|
"Properties",
|
||||||
"XObject",
|
"XObject",
|
||||||
]);
|
]);
|
||||||
|
const langPromise = this.pdfManager.ensureCatalog("lang");
|
||||||
|
|
||||||
const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
|
const [contentStream, , lang] = await Promise.all([
|
||||||
return dataPromises.then(([contentStream]) => {
|
contentStreamPromise,
|
||||||
const partialEvaluator = new PartialEvaluator({
|
resourcesPromise,
|
||||||
xref: this.xref,
|
langPromise,
|
||||||
handler,
|
]);
|
||||||
pageIndex: this.pageIndex,
|
const partialEvaluator = new PartialEvaluator({
|
||||||
idFactory: this._localIdFactory,
|
xref: this.xref,
|
||||||
fontCache: this.fontCache,
|
handler,
|
||||||
builtInCMapCache: this.builtInCMapCache,
|
pageIndex: this.pageIndex,
|
||||||
standardFontDataCache: this.standardFontDataCache,
|
idFactory: this._localIdFactory,
|
||||||
globalImageCache: this.globalImageCache,
|
fontCache: this.fontCache,
|
||||||
systemFontCache: this.systemFontCache,
|
builtInCMapCache: this.builtInCMapCache,
|
||||||
options: this.evaluatorOptions,
|
standardFontDataCache: this.standardFontDataCache,
|
||||||
});
|
globalImageCache: this.globalImageCache,
|
||||||
|
systemFontCache: this.systemFontCache,
|
||||||
|
options: this.evaluatorOptions,
|
||||||
|
});
|
||||||
|
|
||||||
return partialEvaluator.getTextContent({
|
return partialEvaluator.getTextContent({
|
||||||
stream: contentStream,
|
stream: contentStream,
|
||||||
task,
|
task,
|
||||||
resources: this.resources,
|
resources: this.resources,
|
||||||
includeMarkedContent,
|
includeMarkedContent,
|
||||||
disableNormalization,
|
disableNormalization,
|
||||||
sink,
|
sink,
|
||||||
viewBox: this.view,
|
viewBox: this.view,
|
||||||
});
|
lang,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2307,6 +2307,7 @@ class PartialEvaluator {
|
|||||||
sink,
|
sink,
|
||||||
seenStyles = new Set(),
|
seenStyles = new Set(),
|
||||||
viewBox,
|
viewBox,
|
||||||
|
lang = null,
|
||||||
markedContentData = null,
|
markedContentData = null,
|
||||||
disableNormalization = false,
|
disableNormalization = false,
|
||||||
keepWhiteSpace = false,
|
keepWhiteSpace = false,
|
||||||
@ -2323,6 +2324,7 @@ class PartialEvaluator {
|
|||||||
const textContent = {
|
const textContent = {
|
||||||
items: [],
|
items: [],
|
||||||
styles: Object.create(null),
|
styles: Object.create(null),
|
||||||
|
lang,
|
||||||
};
|
};
|
||||||
const textContentItem = {
|
const textContentItem = {
|
||||||
initialized: false,
|
initialized: false,
|
||||||
@ -3296,6 +3298,7 @@ class PartialEvaluator {
|
|||||||
sink: sinkWrapper,
|
sink: sinkWrapper,
|
||||||
seenStyles,
|
seenStyles,
|
||||||
viewBox,
|
viewBox,
|
||||||
|
lang,
|
||||||
markedContentData,
|
markedContentData,
|
||||||
disableNormalization,
|
disableNormalization,
|
||||||
keepWhiteSpace,
|
keepWhiteSpace,
|
||||||
|
|||||||
@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
|
|||||||
* items are included when includeMarkedContent is true.
|
* items are included when includeMarkedContent is true.
|
||||||
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
|
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
|
||||||
* indexed by font name.
|
* indexed by font name.
|
||||||
|
* @property {string | null} lang - The document /Lang attribute.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1677,6 +1678,7 @@ class PDFPageProxy {
|
|||||||
resolve(textContent);
|
resolve(textContent);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
textContent.lang ??= value.lang;
|
||||||
Object.assign(textContent.styles, value.styles);
|
Object.assign(textContent.styles, value.styles);
|
||||||
textContent.items.push(...value.items);
|
textContent.items.push(...value.items);
|
||||||
pump();
|
pump();
|
||||||
@ -1687,6 +1689,7 @@ class PDFPageProxy {
|
|||||||
const textContent = {
|
const textContent = {
|
||||||
items: [],
|
items: [],
|
||||||
styles: Object.create(null),
|
styles: Object.create(null),
|
||||||
|
lang: null,
|
||||||
};
|
};
|
||||||
pump();
|
pump();
|
||||||
});
|
});
|
||||||
|
|||||||
@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
|
|||||||
const ascentCache = new Map();
|
const ascentCache = new Map();
|
||||||
let _canvasContext = null;
|
let _canvasContext = null;
|
||||||
|
|
||||||
function getCtx() {
|
function getCtx(lang = null) {
|
||||||
if (!_canvasContext) {
|
if (!_canvasContext) {
|
||||||
// We don't use an OffscreenCanvas here because we use serif/sans serif
|
// We don't use an OffscreenCanvas here because we use serif/sans serif
|
||||||
// fonts with it and they depend on the locale.
|
// fonts with it and they depend on the locale.
|
||||||
@ -89,13 +89,13 @@ function cleanupTextLayer() {
|
|||||||
_canvasContext = null;
|
_canvasContext = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getAscent(fontFamily) {
|
function getAscent(fontFamily, lang) {
|
||||||
const cachedAscent = ascentCache.get(fontFamily);
|
const cachedAscent = ascentCache.get(fontFamily);
|
||||||
if (cachedAscent) {
|
if (cachedAscent) {
|
||||||
return cachedAscent;
|
return cachedAscent;
|
||||||
}
|
}
|
||||||
|
|
||||||
const ctx = getCtx();
|
const ctx = getCtx(lang);
|
||||||
|
|
||||||
const savedFont = ctx.font;
|
const savedFont = ctx.font;
|
||||||
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
|
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
|
||||||
@ -162,7 +162,7 @@ function getAscent(fontFamily) {
|
|||||||
return DEFAULT_FONT_ASCENT;
|
return DEFAULT_FONT_ASCENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
function appendText(task, geom) {
|
function appendText(task, geom, lang) {
|
||||||
// Initialize all used properties to keep the caches monomorphic.
|
// Initialize all used properties to keep the caches monomorphic.
|
||||||
const textDiv = document.createElement("span");
|
const textDiv = document.createElement("span");
|
||||||
const textDivProperties = {
|
const textDivProperties = {
|
||||||
@ -184,7 +184,7 @@ function appendText(task, geom) {
|
|||||||
const fontFamily =
|
const fontFamily =
|
||||||
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
|
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
|
||||||
const fontHeight = Math.hypot(tx[2], tx[3]);
|
const fontHeight = Math.hypot(tx[2], tx[3]);
|
||||||
const fontAscent = fontHeight * getAscent(fontFamily);
|
const fontAscent = fontHeight * getAscent(fontFamily, lang);
|
||||||
|
|
||||||
let left, top;
|
let left, top;
|
||||||
if (angle === 0) {
|
if (angle === 0) {
|
||||||
@ -324,7 +324,7 @@ class TextLayerRenderTask {
|
|||||||
div: null,
|
div: null,
|
||||||
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
|
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
|
||||||
properties: null,
|
properties: null,
|
||||||
ctx: getCtx(),
|
ctx: null,
|
||||||
};
|
};
|
||||||
this._styleCache = Object.create(null);
|
this._styleCache = Object.create(null);
|
||||||
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
|
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
|
||||||
@ -371,7 +371,11 @@ class TextLayerRenderTask {
|
|||||||
/**
|
/**
|
||||||
* @private
|
* @private
|
||||||
*/
|
*/
|
||||||
_processItems(items) {
|
_processItems(items, lang) {
|
||||||
|
if (!this._layoutTextParams.ctx) {
|
||||||
|
this._textDivProperties.set(this._rootContainer, { lang });
|
||||||
|
this._layoutTextParams.ctx = getCtx(lang);
|
||||||
|
}
|
||||||
const textDivs = this._textDivs,
|
const textDivs = this._textDivs,
|
||||||
textContentItemsStr = this._textContentItemsStr;
|
textContentItemsStr = this._textContentItemsStr;
|
||||||
|
|
||||||
@ -403,7 +407,7 @@ class TextLayerRenderTask {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
textContentItemsStr.push(item.str);
|
textContentItemsStr.push(item.str);
|
||||||
appendText(this, item);
|
appendText(this, item, lang);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -440,7 +444,7 @@ class TextLayerRenderTask {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Object.assign(styleCache, value.styles);
|
Object.assign(styleCache, value.styles);
|
||||||
this._processItems(value.items);
|
this._processItems(value.items, value.lang);
|
||||||
pump();
|
pump();
|
||||||
}, this._capability.reject);
|
}, this._capability.reject);
|
||||||
};
|
};
|
||||||
@ -476,7 +480,7 @@ function updateTextLayer({
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (mustRescale) {
|
if (mustRescale) {
|
||||||
const ctx = getCtx();
|
const ctx = getCtx(textDivProperties.get(container)?.lang);
|
||||||
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
|
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
|
||||||
const params = {
|
const params = {
|
||||||
prevFontSize: null,
|
prevFontSize: null,
|
||||||
|
|||||||
@ -3128,10 +3128,11 @@ describe("api", function () {
|
|||||||
});
|
});
|
||||||
|
|
||||||
it("gets text content", async function () {
|
it("gets text content", async function () {
|
||||||
const { items, styles } = await page.getTextContent();
|
const { items, styles, lang } = await page.getTextContent();
|
||||||
|
|
||||||
expect(items.length).toEqual(15);
|
expect(items.length).toEqual(15);
|
||||||
expect(objectSize(styles)).toEqual(5);
|
expect(objectSize(styles)).toEqual(5);
|
||||||
|
expect(lang).toEqual("en");
|
||||||
|
|
||||||
const text = mergeText(items);
|
const text = mergeText(items);
|
||||||
expect(text).toEqual(`Table Of Content
|
expect(text).toEqual(`Table Of Content
|
||||||
@ -3146,13 +3147,14 @@ page 1 / 3`);
|
|||||||
);
|
);
|
||||||
const pdfDoc = await loadingTask.promise;
|
const pdfDoc = await loadingTask.promise;
|
||||||
const pdfPage = await pdfDoc.getPage(1);
|
const pdfPage = await pdfDoc.getPage(1);
|
||||||
const { items, styles } = await pdfPage.getTextContent({
|
const { items, styles, lang } = await pdfPage.getTextContent({
|
||||||
disableNormalization: true,
|
disableNormalization: true,
|
||||||
});
|
});
|
||||||
expect(items.length).toEqual(1);
|
expect(items.length).toEqual(1);
|
||||||
// Font name will be a random object id.
|
// Font name will be a random object id.
|
||||||
const fontName = items[0].fontName;
|
const fontName = items[0].fontName;
|
||||||
expect(Object.keys(styles)).toEqual([fontName]);
|
expect(Object.keys(styles)).toEqual([fontName]);
|
||||||
|
expect(lang).toEqual(null);
|
||||||
|
|
||||||
expect(items[0]).toEqual({
|
expect(items[0]).toEqual({
|
||||||
dir: "ltr",
|
dir: "ltr",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user