Consider foo-\nBar as a compound word

Fixes #18693.
This commit is contained in:
Calixte Denizet 2024-09-10 22:00:04 +02:00
parent 0ac7f294cd
commit 06f9d8002d
4 changed files with 48 additions and 17 deletions

View File

@ -666,3 +666,4 @@
!highlight.pdf !highlight.pdf
!bug1708040.pdf !bug1708040.pdf
!issue18694.pdf !issue18694.pdf
!issue18693.pdf

BIN
test/pdfs/issue18693.pdf Executable file

Binary file not shown.

View File

@ -1062,6 +1062,26 @@ describe("pdf_find_controller", function () {
await testOnFind({ eventBus }); await testOnFind({ eventBus });
}); });
// Regression test using the issue18693.pdf fixture: searches for a hyphenated
// compound word ("hel-Lo") in a text where the word is split over two lines.
it("performs a search in a text with compound word on two lines", async function () {
// Set up a find controller (and its event bus) on the fixture document.
const { eventBus, pdfFindController } =
await initPdfFindController("issue18693.pdf");
await testSearch({
eventBus,
pdfFindController,
state: {
// Lowercase letter, hyphen, then an uppercase letter.
query: "hel-Lo",
},
// Exactly one match is expected, on the single listed page.
matchesPerPage: [1],
selectedMatch: {
pageIndex: 0,
matchIndex: 0,
},
// Expected start offset of the match within the page content.
pageMatches: [[6]],
// Expected match length is 7 although the query has 6 characters.
// NOTE(review): presumably the extra character is the line break between
// "hel-" and "Lo" in the original page text — confirm against `testSearch`.
pageMatchesLength: [[7]],
});
});
describe("custom matcher", () => { describe("custom matcher", () => {
it("calls to the matcher with the right arguments", async () => { it("calls to the matcher with the right arguments", async () => {
const QUERY = "Foo bar"; const QUERY = "Foo bar";

View File

@ -131,7 +131,8 @@ function normalize(text) {
// 30A0-30FF: Katakana // 30A0-30FF: Katakana
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
const HKDiacritics = "(?:\u3099|\u309A)"; const HKDiacritics = "(?:\u3099|\u309A)";
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; const CompoundWord = "\\p{Ll}-\\n\\p{Lu}";
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
if (syllablePositions.length === 0) { if (syllablePositions.length === 0) {
// Most of the syllables belong to Hangul so there are no need // Most of the syllables belong to Hangul so there are no need
@ -193,7 +194,7 @@ function normalize(text) {
normalized = normalized.replace( normalized = normalized.replace(
normalizationRegex, normalizationRegex,
(match, p1, p2, p3, p4, p5, p6, p7, p8, i) => { (match, p1, p2, p3, p4, p5, p6, p7, p8, p9, i) => {
i -= shiftOrigin; i -= shiftOrigin;
if (p1) { if (p1) {
// Maybe fractions or quotations mark... // Maybe fractions or quotations mark...
@ -267,7 +268,7 @@ function normalize(text) {
if (hasTrailingDashEOL) { if (hasTrailingDashEOL) {
// Diacritics are followed by a -\n. // Diacritics are followed by a -\n.
// See comments in `if (p5)` block. // See comments in `if (p6)` block.
i += len - 1; i += len - 1;
positions.push([i - shift + 1, 1 + shift]); positions.push([i - shift + 1, 1 + shift]);
shift += 1; shift += 1;
@ -280,32 +281,41 @@ function normalize(text) {
} }
if (p5) { if (p5) {
// Compound word with a line break after the hyphen.
positions.push([i - shift + 3, 1 + shift]);
shift += 1;
shiftOrigin += 1;
eol += 1;
return p5.replace("\n", "");
}
if (p6) {
// "X-\n" is removed because an hyphen at the end of a line // "X-\n" is removed because an hyphen at the end of a line
// with not a space before is likely here to mark a break // with not a space before is likely here to mark a break
// in a word. // in a word.
// If X is encoded with UTF-32 then it can have a length greater than 1. // If X is encoded with UTF-32 then it can have a length greater than 1.
// The \n isn't in the original text so here y = i, n = X.len - 2 and // The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1. // o = X.len - 1.
const len = p5.length - 2; const len = p6.length - 2;
positions.push([i - shift + len, 1 + shift]); positions.push([i - shift + len, 1 + shift]);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p5.slice(0, -2); return p6.slice(0, -2);
}
if (p6) {
// An ideographic at the end of a line doesn't imply adding an extra
// white space.
// A CJK can be encoded in UTF-32, hence their length isn't always 1.
const len = p6.length - 1;
positions.push([i - shift + len, shift]);
shiftOrigin += 1;
eol += 1;
return p6.slice(0, -1);
} }
if (p7) { if (p7) {
// An ideographic at the end of a line doesn't imply adding an extra
// white space.
// A CJK can be encoded in UTF-32, hence their length isn't always 1.
const len = p7.length - 1;
positions.push([i - shift + len, shift]);
shiftOrigin += 1;
eol += 1;
return p7.slice(0, -1);
}
if (p8) {
// eol is replaced by space: "foo\nbar" is likely equivalent to // eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar". // "foo bar".
positions.push([i - shift + 1, shift - 1]); positions.push([i - shift + 1, shift - 1]);
@ -327,7 +337,7 @@ function normalize(text) {
shift -= newCharLen; shift -= newCharLen;
shiftOrigin += newCharLen; shiftOrigin += newCharLen;
} }
return p8; return p9;
} }
); );