parent 0ac7f294cd
commit 06f9d8002d

test/pdfs/.gitignore (vendored): 1 line changed
@@ -666,3 +666,4 @@
 !highlight.pdf
 !bug1708040.pdf
 !issue18694.pdf
+!issue18693.pdf
BIN test/pdfs/issue18693.pdf (new executable file)
Binary file not shown.
@@ -1062,6 +1062,26 @@ describe("pdf_find_controller", function () {
     await testOnFind({ eventBus });
   });
 
+  it("performs a search in a text with compound word on two lines", async function () {
+    const { eventBus, pdfFindController } =
+      await initPdfFindController("issue18693.pdf");
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "hel-Lo",
+      },
+      matchesPerPage: [1],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[6]],
+      pageMatchesLength: [[7]],
+    });
+  });
+
   describe("custom matcher", () => {
     it("calls to the matcher with the right arguments", async () => {
       const QUERY = "Foo bar";
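Note (not part of the patch): the fixture itself isn't shown here, but the expectations make sense if the page contains a compound word split across two lines, e.g. "hel-" at the end of one line and "Lo" at the start of the next. A minimal sketch of the behaviour change, with made-up page content:

// Previously the "\S-\n" rule dropped both the hyphen and the line break,
// so a query that keeps the hyphen could never match. The new rule drops
// only the line break for a lowercase-hyphen-newline-uppercase sequence.
const pageText = "... hel-\nLo ..."; // assumed content, not the real fixture
const oldNorm = pageText.replace(/(\S)-\n/gu, "$1"); // "... helLo ..."
const newNorm = pageText.replace(/(\p{Ll})-\n(\p{Lu})/gu, "$1-$2"); // "... hel-Lo ..."
console.log(oldNorm.includes("hel-Lo")); // false
console.log(newNorm.includes("hel-Lo")); // true

The expected pageMatchesLength of 7, versus the 6-character query, presumably reflects the removed line break being counted again when the match is mapped back onto the page text.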
@@ -131,7 +131,8 @@ function normalize(text) {
   // 30A0-30FF: Katakana
   const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
   const HKDiacritics = "(?:\u3099|\u309A)";
-  const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
+  const CompoundWord = "\\p{Ll}-\\n\\p{Lu}";
+  const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
 
   if (syllablePositions.length === 0) {
     // Most of the syllables belong to Hangul so there are no need
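Since alternatives in a regular expression are tried left to right, placing ${CompoundWord} before (\S-\n) means a lowercase-hyphen-newline-uppercase sequence is captured by the new group instead of the generic dash-at-end-of-line group. A standalone sketch of that dispatch (simplified, not the controller's full regexp):

const CompoundWord = "\\p{Ll}-\\n\\p{Lu}";
const dashEOL = "\\S-\\n";
const re = new RegExp(`(${CompoundWord})|(${dashEOL})`, "gum");
for (const m of "hel-\nLo or back-\nground".matchAll(re)) {
  console.log(m[1] ? `compound: ${JSON.stringify(m[1])}` : `dash-eol: ${JSON.stringify(m[2])}`);
}
// compound: "l-\nL"  (only the "\n" will be removed, keeping the hyphen)
// dash-eol: "k-\n"   (the whole "-\n" is removed, as before)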
@@ -193,7 +194,7 @@ function normalize(text) {
 
   normalized = normalized.replace(
     normalizationRegex,
-    (match, p1, p2, p3, p4, p5, p6, p7, p8, i) => {
+    (match, p1, p2, p3, p4, p5, p6, p7, p8, p9, i) => {
       i -= shiftOrigin;
       if (p1) {
         // Maybe fractions or quotations mark...
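Inserting a capture group in the middle of the alternation shifts every later group index up by one, which is why the callback gains a p9 parameter and the handlers below are renamed mechanically (p5 -> p6, p6 -> p7, p7 -> p8, p8 -> p9). A trivial illustration:

// Illustrative only: the same character lands one group further along
// once a new alternative is inserted in front of it.
console.log(/(a)|(b)|(c)/.exec("c")[3]);     // "c"
console.log(/(a)|(x)|(b)|(c)/.exec("c")[4]); // "c"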
@@ -267,7 +268,7 @@ function normalize(text) {
 
       if (hasTrailingDashEOL) {
         // Diacritics are followed by a -\n.
-        // See comments in `if (p5)` block.
+        // See comments in `if (p6)` block.
         i += len - 1;
         positions.push([i - shift + 1, 1 + shift]);
         shift += 1;
@@ -280,32 +281,41 @@ function normalize(text) {
       }
 
       if (p5) {
+        // Compound word with a line break after the hyphen.
+        positions.push([i - shift + 3, 1 + shift]);
+        shift += 1;
+        shiftOrigin += 1;
+        eol += 1;
+        return p5.replace("\n", "");
+      }
+
+      if (p6) {
         // "X-\n" is removed because an hyphen at the end of a line
         // with not a space before is likely here to mark a break
         // in a word.
         // If X is encoded with UTF-32 then it can have a length greater than 1.
         // The \n isn't in the original text so here y = i, n = X.len - 2 and
         // o = X.len - 1.
-        const len = p5.length - 2;
+        const len = p6.length - 2;
         positions.push([i - shift + len, 1 + shift]);
         shift += 1;
         shiftOrigin += 1;
         eol += 1;
-        return p5.slice(0, -2);
+        return p6.slice(0, -2);
       }
 
-      if (p6) {
+      if (p7) {
         // An ideographic at the end of a line doesn't imply adding an extra
         // white space.
         // A CJK can be encoded in UTF-32, hence their length isn't always 1.
-        const len = p6.length - 1;
+        const len = p7.length - 1;
         positions.push([i - shift + len, shift]);
         shiftOrigin += 1;
         eol += 1;
-        return p6.slice(0, -1);
+        return p7.slice(0, -1);
       }
 
-      if (p7) {
+      if (p8) {
         // eol is replaced by space: "foo\nbar" is likely equivalent to
         // "foo bar".
         positions.push([i - shift + 1, shift - 1]);
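The new p5 handler keeps the hyphen and removes only the line break, unlike the p6 handler below it, which strips the trailing "-\n" entirely. The pushed [i - shift + 3, 1 + shift] entry records where a character was dropped (the "\n" sits at offset 2 of the four-character match) so that match positions can later be mapped back onto the page text. A reduced simulation for a single string (variable names mirror normalize(); shiftOrigin and eol are omitted; illustrative only, not the controller code):

let shift = 0;
const positions = [];
const normalized = "hel-\nLo".replace(/\p{Ll}-\n\p{Lu}/gu, (m, i) => {
  positions.push([i - shift + 3, 1 + shift]); // one char ("\n") dropped before this point
  shift += 1;
  return m.replace("\n", "");
});
console.log(normalized); // "hel-Lo"
console.log(positions);  // [[5, 1]]  (position in the normalized string, accumulated shift)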
@@ -327,7 +337,7 @@ function normalize(text) {
         shift -= newCharLen;
         shiftOrigin += newCharLen;
       }
-      return p8;
+      return p9;
     }
   );
 