Merge pull request #20465 from calixteman/update_char_to_nfkc

Fix the regex string used to find the chars to normalize with NFKC when searching
This commit is contained in:
calixteman 2025-11-28 16:59:23 +01:00 committed by GitHub
commit ec5330f78c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -13,6 +13,8 @@
* limitations under the License.
*/
import { FeatureTest } from "pdfjs-lib";
const CharacterType = {
SPACE: 0,
ALPHA_LETTER: 1,
@ -114,8 +116,13 @@ function getCharacterType(charCode) {
let NormalizeWithNFKC;
function getNormalizeWithNFKC() {
/* eslint-disable no-irregular-whitespace */
NormalizeWithNFKC ||= ` ¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´- - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉--№ℙ-ℝ℠-™ℤΩℨK---ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰꟲ-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`;
if (
(typeof PDFJSDev === "undefined" && FeatureTest.platform.isFirefox) ||
(typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL"))
) {
/* eslint-disable no-irregular-whitespace */
NormalizeWithNFKC ||= `\xA0¨ª¯²-µ¸-º¼-¾IJ-ijĿ-ŀʼnſDŽ-njDZ-dzʰ-ʸ˘-˝ˠ-ˤʹͺ;΄-΅·ϐ-ϖϰ-ϲϴ-ϵϹևٵ-ٸक़-य़ড়-ঢ়য়ਲ਼ਸ਼ਖ਼-ਜ਼ਫ਼ଡ଼-ଢ଼ำຳໜ-ໝ༌གྷཌྷདྷབྷཛྷཀྵჼᴬ-ᴮᴰ-ᴺᴼ-ᵍᵏ-ᵪᵸᶛ-ᶿẚ-ẛάέήίόύώΆ᾽-῁ΈΉ῍-῏ΐΊ῝-῟ΰΎ῭-`ΌΏ´- - ‑‗․-… ″-‴‶-‷‼‾⁇-⁉⁗ ⁰-ⁱ⁴-₎ₐ-ₜ₨℀-℃℅-ℇ℉--№ℙ-ℝ℠-™ℤΩℨK---ℹ℻-⅀ⅅ-ⅉ⅐-ⅿ↉∬-∭∯-∰〈-〉①-⓪⨌⩴-⩶⫝̸ⱼ-ⱽⵯ⺟⻳⼀-⿕ 〶〸-〺゛-゜ゟヿㄱ-ㆎ㆒-㆟㈀-㈞㈠-㉇㉐-㉾㊀-㏿ꚜ-ꚝꝰ꟱-ꟴꟸ-ꟹꭜ-ꭟꭩ豈-嗀塚晴凞-羽蘒諸逸-都飯-舘並-龎ff-stﬓ-ﬗיִײַ-זּטּ-לּמּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-﷼︐-︙︰-﹄﹇-﹒﹔-﹦﹨-﹫ﹰ-ﹲﹴﹶ-ﻼ!-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ¢-₩`;
}
if (typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING")) {
const ranges = [];
@ -145,7 +152,24 @@ function getNormalizeWithNFKC() {
}
}
}
if (ranges.join("") !== NormalizeWithNFKC) {
const rangesStr = ranges.join("");
if (!NormalizeWithNFKC) {
NormalizeWithNFKC = rangesStr;
} else if (rangesStr !== NormalizeWithNFKC) {
// The joined `ranges` string and `NormalizeWithNFKC` disagree: log the first
// index where they differ (with both code units in U+XXXX form) so that the
// string to update can be located quickly.
// The scan must start at index 0, otherwise a mismatch on the very first
// character is never reported (or a later index is reported as the first
// difference).
for (let i = 0; i < rangesStr.length; i++) {
  if (rangesStr[i] !== NormalizeWithNFKC[i]) {
    console.log(
      `Difference at index ${i}: ` +
        `U+${rangesStr.charCodeAt(i).toString(16).toUpperCase().padStart(4, "0")}` +
        `!== U+${NormalizeWithNFKC.charCodeAt(i)
          .toString(16)
          .toUpperCase()
          .padStart(4, "0")}`
    );
    // Only the first difference is reported.
    break;
  }
}
throw new Error(
"getNormalizeWithNFKC - update the `NormalizeWithNFKC` string."
);