Merge pull request #20334 from calixteman/issue20319

Only apply word spacing when there is a 0x20 in the text chunk
This commit is contained in:
Tim van der Meij 2025-10-04 14:47:30 +02:00 committed by GitHub
commit d27cd6a4a1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 120 additions and 3 deletions

View File

@ -2927,7 +2927,7 @@ class PartialEvaluator {
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const glyph = glyphs[i];
const { category } = glyph;
const { category, originalCharCode } = glyph;
if (category.isInvisibleFormatMark) {
continue;
@ -2941,6 +2941,10 @@ class PartialEvaluator {
}
let scaledDim = glyphWidth * scale;
if (originalCharCode === 0x20) {
charSpacing += textState.wordSpacing;
}
if (!keepWhiteSpace && category.isWhitespace) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
@ -2948,13 +2952,13 @@ class PartialEvaluator {
// compareWithLastPosition.
// This way we can merge real spaces and spaces due to cursor moves.
if (!font.vertical) {
charSpacing += scaledDim + textState.wordSpacing;
charSpacing += scaledDim;
textState.translateTextMatrix(
charSpacing * textState.textHScale,
0
);
} else {
charSpacing += -scaledDim + textState.wordSpacing;
charSpacing += -scaledDim;
textState.translateTextMatrix(0, -charSpacing);
}
saveLastChar(" ");

View File

@ -746,3 +746,5 @@
!issue20232.pdf
!bug1989304.pdf
!comments.pdf
!issue20319_1.pdf
!issue20319_2.pdf

BIN
test/pdfs/issue20319_1.pdf Normal file

Binary file not shown.

View File

@ -0,0 +1,80 @@
%PDF-1.7
1 0 obj
<< /Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<< /Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<< /Type /Page
/Parent 2 0 R
/MediaBox [0 0 200 200]
/Resources <<
/Font << /F1 4 0 R >>
>>
/Contents 5 0 R
>>
endobj
4 0 obj
<< /Type /Font
/Subtype /Type3
/Name /F1
/FontBBox [0 0 500 500]
/FontMatrix [0.001 0 0 0.001 0 0]
/Encoding << /Differences [32 /A 33 /A] >>
/CharProcs << /A 6 0 R >>
/FirstChar 32
/LastChar 33
/Widths [500 500]
/Resources << >>
>>
endobj
5 0 obj
<< /Length 45 >>
stream
BT
/F1 20 Tf
50 Tw
100 100 Td
<212021> Tj
ET
endstream
endobj
6 0 obj
<< /Length 77 >>
stream
500 0 d0
50 50 50 400 re
400 50 50 400 re
50 400 400 50 re
50 200 400 50 re
f
endstream
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000062 00000 n
0000000126 00000 n
0000000275 00000 n
0000000554 00000 n
0000000650 00000 n
trailer
<< /Size 7
/Root 1 0 R
>>
startxref
778
%%EOF

View File

@ -4027,6 +4027,37 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
expect(items[1].fontName).not.toEqual(items[0].fontName);
});
it("gets text content with word spacing (issue 20319)", async function () {
  // Regression test for issue 20319: open the fixture, extract the text of
  // its first page with `disableNormalization` set, and compare the merged
  // text against the expected string.
  const task = getDocument(buildGetDocumentParams("issue20319_1.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);

  const textContent = await firstPage.getTextContent({
    disableNormalization: true,
  });
  const mergedText = mergeText(textContent.items);

  expect(mergedText).toEqual("A A");

  // Release the resources held by the loading task.
  await task.destroy();
});
it("gets text content with word spacing and a fake space (issue 20319)", async function () {
  // Companion regression test for issue 20319 using the second fixture:
  // extract the text of page 1 with `disableNormalization` set and compare
  // the merged text against the expected string.
  const task = getDocument(buildGetDocumentParams("issue20319_2.pdf"));
  const doc = await task.promise;
  const firstPage = await doc.getPage(1);

  const textContent = await firstPage.getTextContent({
    disableNormalization: true,
  });
  const mergedText = mergeText(textContent.items);

  expect(mergedText).toEqual("AA A");

  // Release the resources held by the loading task.
  await task.destroy();
});
it("gets empty structure tree", async function () {
const tree = await page.getStructTree();