Merge pull request #19123 from calixteman/improve_search_perf

Very slightly improve the performance when searching in a pdf
This commit is contained in:
calixteman 2024-11-28 17:44:14 +01:00 committed by GitHub
commit 308ca2a16f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -184,7 +184,7 @@ function normalize(text) {
} }
let normalized = text.normalize("NFD"); let normalized = text.normalize("NFD");
const positions = [[0, 0]]; const positions = [0, 0];
let rawDiacriticsIndex = 0; let rawDiacriticsIndex = 0;
let syllableIndex = 0; let syllableIndex = 0;
let shift = 0; let shift = 0;
@ -201,7 +201,7 @@ function normalize(text) {
const replacement = CHARACTERS_TO_NORMALIZE[p1]; const replacement = CHARACTERS_TO_NORMALIZE[p1];
const jj = replacement.length; const jj = replacement.length;
for (let j = 1; j < jj; j++) { for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]); positions.push(i - shift + j, shift - j);
} }
shift -= jj - 1; shift -= jj - 1;
return replacement; return replacement;
@ -216,7 +216,7 @@ function normalize(text) {
} }
const jj = replacement.length; const jj = replacement.length;
for (let j = 1; j < jj; j++) { for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]); positions.push(i - shift + j, shift - j);
} }
shift -= jj - 1; shift -= jj - 1;
return replacement; return replacement;
@ -233,13 +233,13 @@ function normalize(text) {
} else { } else {
// i is the position of the first diacritic // i is the position of the first diacritic
// so (i - 1) is the position for the letter before. // so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + 1, shift - 1]); positions.push(i - 1 - shift + 1, shift - 1);
shift -= 1; shift -= 1;
shiftOrigin += 1; shiftOrigin += 1;
} }
// End-of-line. // End-of-line.
positions.push([i - shift + 1, shift]); positions.push(i - shift + 1, shift);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -261,7 +261,7 @@ function normalize(text) {
for (let j = 1; j <= jj; j++) { for (let j = 1; j <= jj; j++) {
// i is the position of the first diacritic // i is the position of the first diacritic
// so (i - 1) is the position for the letter before. // so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]); positions.push(i - 1 - shift + j, shift - j);
} }
shift -= jj; shift -= jj;
shiftOrigin += jj; shiftOrigin += jj;
@ -270,7 +270,7 @@ function normalize(text) {
// Diacritics are followed by a -\n. // Diacritics are followed by a -\n.
// See comments in `if (p6)` block. // See comments in `if (p6)` block.
i += len - 1; i += len - 1;
positions.push([i - shift + 1, 1 + shift]); positions.push(i - shift + 1, 1 + shift);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -296,7 +296,7 @@ function normalize(text) {
// The \n isn't in the original text so here y = i, n = X.len - 2 and // The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1. // o = X.len - 1.
const len = p6.length - 2; const len = p6.length - 2;
positions.push([i - shift + len, 1 + shift]); positions.push(i - shift + len, 1 + shift);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -308,7 +308,7 @@ function normalize(text) {
// white space. // white space.
// A CJK can be encoded in UTF-32, hence their length isn't always 1. // A CJK can be encoded in UTF-32, hence their length isn't always 1.
const len = p7.length - 1; const len = p7.length - 1;
positions.push([i - shift + len, shift]); positions.push(i - shift + len, shift);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p7.slice(0, -1); return p7.slice(0, -1);
@ -317,7 +317,7 @@ function normalize(text) {
if (p8) { if (p8) {
// eol is replaced by space: "foo\nbar" is likely equivalent to // eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar". // "foo bar".
positions.push([i - shift + 1, shift - 1]); positions.push(i - shift + 1, shift - 1);
shift -= 1; shift -= 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -331,7 +331,7 @@ function normalize(text) {
const newCharLen = syllablePositions[syllableIndex][0] - 1; const newCharLen = syllablePositions[syllableIndex][0] - 1;
++syllableIndex; ++syllableIndex;
for (let j = 1; j <= newCharLen; j++) { for (let j = 1; j <= newCharLen; j++) {
positions.push([i - (shift - j), shift - j]); positions.push(i - (shift - j), shift - j);
} }
shift -= newCharLen; shift -= newCharLen;
shiftOrigin += newCharLen; shiftOrigin += newCharLen;
@ -340,9 +340,15 @@ function normalize(text) {
} }
); );
positions.push([normalized.length, shift]); positions.push(normalized.length, shift);
const starts = new Uint32Array(positions.length >> 1);
const shifts = new Int32Array(positions.length >> 1);
for (let i = 0, ii = positions.length; i < ii; i += 2) {
starts[i >> 1] = positions[i];
shifts[i >> 1] = positions[i + 1];
}
return [normalized, positions, hasDiacritics]; return [normalized, [starts, shifts], hasDiacritics];
} }
// Determine the original, non-normalized, match index such that highlighting of // Determine the original, non-normalized, match index such that highlighting of
@ -353,25 +359,26 @@ function getOriginalIndex(diffs, pos, len) {
return [pos, len]; return [pos, len];
} }
const [starts, shifts] = diffs;
// First char in the new string. // First char in the new string.
const start = pos; const start = pos;
// Last char in the new string. // Last char in the new string.
const end = pos + len - 1; const end = pos + len - 1;
let i = binarySearchFirstItem(diffs, x => x[0] >= start); let i = binarySearchFirstItem(starts, x => x >= start);
if (diffs[i][0] > start) { if (starts[i] > start) {
--i; --i;
} }
let j = binarySearchFirstItem(diffs, x => x[0] >= end, i); let j = binarySearchFirstItem(starts, x => x >= end, i);
if (diffs[j][0] > end) { if (starts[j] > end) {
--j; --j;
} }
// First char in the old string. // First char in the old string.
const oldStart = start + diffs[i][1]; const oldStart = start + shifts[i];
// Last char in the old string. // Last char in the old string.
const oldEnd = end + diffs[j][1]; const oldEnd = end + shifts[j];
const oldLen = oldEnd + 1 - oldStart; const oldLen = oldEnd + 1 - oldStart;
return [oldStart, oldLen]; return [oldStart, oldLen];