diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 256a50626..d38fab6b0 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2927,7 +2927,7 @@ class PartialEvaluator { for (let i = 0, ii = glyphs.length; i < ii; i++) { const glyph = glyphs[i]; - const { category } = glyph; + const { category, originalCharCode } = glyph; if (category.isInvisibleFormatMark) { continue; @@ -2941,6 +2941,10 @@ class PartialEvaluator { } let scaledDim = glyphWidth * scale; + if (originalCharCode === 0x20) { + charSpacing += textState.wordSpacing; + } + if (!keepWhiteSpace && category.isWhitespace) { // Don't push a " " in the textContentItem // (except when it's between two non-spaces chars), @@ -2948,13 +2952,13 @@ class PartialEvaluator { // compareWithLastPosition. // This way we can merge real spaces and spaces due to cursor moves. if (!font.vertical) { - charSpacing += scaledDim + textState.wordSpacing; + charSpacing += scaledDim; textState.translateTextMatrix( charSpacing * textState.textHScale, 0 ); } else { - charSpacing += -scaledDim + textState.wordSpacing; + charSpacing += -scaledDim; textState.translateTextMatrix(0, -charSpacing); } saveLastChar(" "); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6b1ede9ae..45530519a 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -746,3 +746,5 @@ !issue20232.pdf !bug1989304.pdf !comments.pdf +!issue20319_1.pdf +!issue20319_2.pdf diff --git a/test/pdfs/issue20319_1.pdf b/test/pdfs/issue20319_1.pdf new file mode 100644 index 000000000..0fe77c699 Binary files /dev/null and b/test/pdfs/issue20319_1.pdf differ diff --git a/test/pdfs/issue20319_2.pdf b/test/pdfs/issue20319_2.pdf new file mode 100644 index 000000000..0878fb50b --- /dev/null +++ b/test/pdfs/issue20319_2.pdf @@ -0,0 +1,80 @@ +%PDF-1.7 +1 0 obj +<< /Type /Catalog + /Pages 2 0 R +>> +endobj + +2 0 obj +<< /Type /Pages + /Kids [3 0 R] + /Count 1 +>> +endobj + +3 0 obj +<< /Type /Page + /Parent 2 0 R + /MediaBox [0 0 200 200] + /Resources << + /Font << /F1 4 0 R >> + >> + /Contents 5 0 R +>> +endobj + +4 0 obj +<< /Type /Font + /Subtype /Type3 + /Name /F1 + /FontBBox [0 0 500 500] + /FontMatrix [0.001 0 0 0.001 0 0] + /Encoding << /Differences [32 /A 33 /A] >> + /CharProcs << /A 6 0 R >> + /FirstChar 32 + /LastChar 33 + /Widths [500 500] + /Resources << >> +>> +endobj + +5 0 obj +<< /Length 45 >> +stream +BT +/F1 20 Tf +50 Tw +100 100 Td +<212021> Tj +ET +endstream +endobj + +6 0 obj +<< /Length 77 >> +stream +500 0 d0 +50 50 50 400 re +400 50 50 400 re +50 400 400 50 re +50 200 400 50 re +f +endstream +endobj + +xref +0 7 +0000000000 65535 f +0000000009 00000 n +0000000062 00000 n +0000000126 00000 n +0000000275 00000 n +0000000554 00000 n +0000000650 00000 n +trailer +<< /Size 7 + /Root 1 0 R +>> +startxref +778 +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 648fe309e..a006bb226 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -4027,6 +4027,37 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) expect(items[1].fontName).not.toEqual(items[0].fontName); }); + it("gets text content with word spacing (issue 20319)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue20319_1.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + + expect(text).toEqual("A A"); + + await loadingTask.destroy(); + }); + + it("gets text content with word spacing and a fake space (issue 20319)", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue20319_2.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + expect(text).toEqual("AA A"); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();