Very slightly improve the performance when searching in a pdf

It helps to slightly decrease memory use in reducing the number of created arrays.
In searching for "a" in pdf.pdf, the time spent in getOriginalIndex is decreased by
around 30%.
This commit is contained in:
Calixte Denizet 2024-11-28 16:43:48 +01:00
parent 65f20b0392
commit 94d53d5b45

View File

@ -184,7 +184,7 @@ function normalize(text) {
} }
let normalized = text.normalize("NFD"); let normalized = text.normalize("NFD");
const positions = [[0, 0]]; const positions = [0, 0];
let rawDiacriticsIndex = 0; let rawDiacriticsIndex = 0;
let syllableIndex = 0; let syllableIndex = 0;
let shift = 0; let shift = 0;
@ -201,7 +201,7 @@ function normalize(text) {
const replacement = CHARACTERS_TO_NORMALIZE[p1]; const replacement = CHARACTERS_TO_NORMALIZE[p1];
const jj = replacement.length; const jj = replacement.length;
for (let j = 1; j < jj; j++) { for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]); positions.push(i - shift + j, shift - j);
} }
shift -= jj - 1; shift -= jj - 1;
return replacement; return replacement;
@ -216,7 +216,7 @@ function normalize(text) {
} }
const jj = replacement.length; const jj = replacement.length;
for (let j = 1; j < jj; j++) { for (let j = 1; j < jj; j++) {
positions.push([i - shift + j, shift - j]); positions.push(i - shift + j, shift - j);
} }
shift -= jj - 1; shift -= jj - 1;
return replacement; return replacement;
@ -233,13 +233,13 @@ function normalize(text) {
} else { } else {
// i is the position of the first diacritic // i is the position of the first diacritic
// so (i - 1) is the position for the letter before. // so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + 1, shift - 1]); positions.push(i - 1 - shift + 1, shift - 1);
shift -= 1; shift -= 1;
shiftOrigin += 1; shiftOrigin += 1;
} }
// End-of-line. // End-of-line.
positions.push([i - shift + 1, shift]); positions.push(i - shift + 1, shift);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -261,7 +261,7 @@ function normalize(text) {
for (let j = 1; j <= jj; j++) { for (let j = 1; j <= jj; j++) {
// i is the position of the first diacritic // i is the position of the first diacritic
// so (i - 1) is the position for the letter before. // so (i - 1) is the position for the letter before.
positions.push([i - 1 - shift + j, shift - j]); positions.push(i - 1 - shift + j, shift - j);
} }
shift -= jj; shift -= jj;
shiftOrigin += jj; shiftOrigin += jj;
@ -270,7 +270,7 @@ function normalize(text) {
// Diacritics are followed by a -\n. // Diacritics are followed by a -\n.
// See comments in `if (p6)` block. // See comments in `if (p6)` block.
i += len - 1; i += len - 1;
positions.push([i - shift + 1, 1 + shift]); positions.push(i - shift + 1, 1 + shift);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -296,7 +296,7 @@ function normalize(text) {
// The \n isn't in the original text so here y = i, n = X.len - 2 and // The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1. // o = X.len - 1.
const len = p6.length - 2; const len = p6.length - 2;
positions.push([i - shift + len, 1 + shift]); positions.push(i - shift + len, 1 + shift);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -308,7 +308,7 @@ function normalize(text) {
// white space. // white space.
// A CJK can be encoded in UTF-32, hence their length isn't always 1. // A CJK can be encoded in UTF-32, hence their length isn't always 1.
const len = p7.length - 1; const len = p7.length - 1;
positions.push([i - shift + len, shift]); positions.push(i - shift + len, shift);
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p7.slice(0, -1); return p7.slice(0, -1);
@ -317,7 +317,7 @@ function normalize(text) {
if (p8) { if (p8) {
// eol is replaced by space: "foo\nbar" is likely equivalent to // eol is replaced by space: "foo\nbar" is likely equivalent to
// "foo bar". // "foo bar".
positions.push([i - shift + 1, shift - 1]); positions.push(i - shift + 1, shift - 1);
shift -= 1; shift -= 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
@ -331,7 +331,7 @@ function normalize(text) {
const newCharLen = syllablePositions[syllableIndex][0] - 1; const newCharLen = syllablePositions[syllableIndex][0] - 1;
++syllableIndex; ++syllableIndex;
for (let j = 1; j <= newCharLen; j++) { for (let j = 1; j <= newCharLen; j++) {
positions.push([i - (shift - j), shift - j]); positions.push(i - (shift - j), shift - j);
} }
shift -= newCharLen; shift -= newCharLen;
shiftOrigin += newCharLen; shiftOrigin += newCharLen;
@ -340,9 +340,15 @@ function normalize(text) {
} }
); );
positions.push([normalized.length, shift]); positions.push(normalized.length, shift);
const starts = new Uint32Array(positions.length >> 1);
const shifts = new Int32Array(positions.length >> 1);
for (let i = 0, ii = positions.length; i < ii; i += 2) {
starts[i >> 1] = positions[i];
shifts[i >> 1] = positions[i + 1];
}
return [normalized, positions, hasDiacritics]; return [normalized, [starts, shifts], hasDiacritics];
} }
// Determine the original, non-normalized, match index such that highlighting of // Determine the original, non-normalized, match index such that highlighting of
@ -353,25 +359,26 @@ function getOriginalIndex(diffs, pos, len) {
return [pos, len]; return [pos, len];
} }
const [starts, shifts] = diffs;
// First char in the new string. // First char in the new string.
const start = pos; const start = pos;
// Last char in the new string. // Last char in the new string.
const end = pos + len - 1; const end = pos + len - 1;
let i = binarySearchFirstItem(diffs, x => x[0] >= start); let i = binarySearchFirstItem(starts, x => x >= start);
if (diffs[i][0] > start) { if (starts[i] > start) {
--i; --i;
} }
let j = binarySearchFirstItem(diffs, x => x[0] >= end, i); let j = binarySearchFirstItem(starts, x => x >= end, i);
if (diffs[j][0] > end) { if (starts[j] > end) {
--j; --j;
} }
// First char in the old string. // First char in the old string.
const oldStart = start + diffs[i][1]; const oldStart = start + shifts[i];
// Last char in the old string. // Last char in the old string.
const oldEnd = end + diffs[j][1]; const oldEnd = end + shifts[j];
const oldLen = oldEnd + 1 - oldStart; const oldLen = oldEnd + 1 - oldStart;
return [oldStart, oldLen]; return [oldStart, oldLen];