Fix some edge cases in the autolinking regex, validate email domains, and add unit tests for both. Fixes: https://github.com/mozilla/pdf.js/issues/19462
143 lines
4.3 KiB
JavaScript
143 lines
4.3 KiB
JavaScript
/* Copyright 2025 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
import { AnnotationType, createValidAbsoluteUrl, Util } from "pdfjs-lib";
|
|
import { getOriginalIndex, normalize } from "./pdf_find_controller.js";
|
|
|
|
/**
 * Convert a DOM rectangle into a normalized rectangle expressed in the
 * coordinates returned by `pdfPageView.getPagePoint`.
 *
 * @param {Object} domRect - Rectangle to convert; only its `width`, `height`,
 *   `left` and `top` properties are read.
 * @param {Object} pdfPageView - Page view providing `textLayer.div` (used as
 *   the origin of the rectangle) and `getPagePoint(x, y)`.
 * @returns {Array<number>|null} The value returned by `Util.normalizeRect`
 *   for the two converted corners, or `null` when the rectangle has a zero
 *   width or height.
 */
function DOMRectToPDF({ width, height, left, top }, pdfPageView) {
  // Zero-area rectangles cannot be turned into a meaningful link area.
  if (width === 0 || height === 0) {
    return null;
  }

  // Make the coordinates relative to the text layer before converting them.
  const pageBox = pdfPageView.textLayer.div.getBoundingClientRect();
  const bottomLeft = pdfPageView.getPagePoint(
    left - pageBox.left,
    top - pageBox.top
  );
  const topRight = pdfPageView.getPagePoint(
    left - pageBox.left + width,
    top - pageBox.top + height
  );

  // The two points are opposite corners; `normalizeRect` is responsible for
  // ordering them.
  return Util.normalizeRect([
    bottomLeft[0],
    bottomLeft[1],
    topRight[0],
    topRight[1],
  ]);
}
|
|
|
|
/**
 * Compute the position data of a link covering the given range.
 *
 * @param {Object} range - Object exposing `getClientRects()`; the result must
 *   have a `length` and be iterable.
 * @param {Object} pdfPageView - Page view forwarded to `DOMRectToPDF`.
 * @returns {Object} `{ rect }` when the range has exactly one client rect,
 *   otherwise `{ quadPoints, rect }` where `quadPoints` holds eight numbers
 *   per non-empty client rect and `rect` is the bounding box of all of them.
 */
function calculateLinkPosition(range, pdfPageView) {
  const rangeRects = range.getClientRects();
  if (rangeRects.length === 1) {
    // NOTE(review): `rect` is `null` here when the only client rect has a
    // zero width or height, since `DOMRectToPDF` returns `null` for those.
    return { rect: DOMRectToPDF(rangeRects[0], pdfPageView) };
  }

  // NOTE(review): when no client rect contributes (empty range, or only
  // zero-area rects) the bounding box keeps these infinite sentinel values.
  const rect = [Infinity, Infinity, -Infinity, -Infinity];
  const quadPoints = [];
  for (const domRect of rangeRects) {
    const normalized = DOMRectToPDF(domRect, pdfPageView);
    if (normalized === null) {
      continue;
    }
    const x1 = normalized[0];
    const y1 = normalized[1];
    const x2 = normalized[2];
    const y2 = normalized[3];

    // One quad per rect, corners ordered (x1, y2), (x2, y2), (x1, y1),
    // (x2, y1).
    quadPoints.push(x1, y2, x2, y2, x1, y1, x2, y1);

    // Grow the bounding box so that it contains this rect.
    rect[0] = Math.min(rect[0], x1);
    rect[1] = Math.min(rect[1], y1);
    rect[2] = Math.max(rect[2], x2);
    rect[3] = Math.max(rect[3], y2);
  }
  return { quadPoints, rect };
}
|
|
|
|
/**
 * Build the data of a link annotation for a URL found in the page text.
 *
 * @param {Object} link - Link description; `url` is the link target while
 *   `index` and `length` locate the matched text and are passed to the
 *   highlighter's `_convertMatches`.
 * @param {Object} pdfPageView - Page view whose `_textHighlighter` provides
 *   `_convertMatches` and `textDivs`.
 * @param {number|string} id - Suffix used to build the annotation id.
 * @returns {Object} Annotation data containing the id, the URL, the
 *   annotation type, and the position computed by `calculateLinkPosition`.
 */
function createLinkAnnotation({ url, index, length }, pdfPageView, id) {
  const highlighter = pdfPageView._textHighlighter;
  // Translate the text position into (div index, offset) pairs.
  const [{ begin, end }] = highlighter._convertMatches([index], [length]);

  // Build a DOM range spanning the matched text so that it can be measured.
  const range = new Range();
  range.setStart(highlighter.textDivs[begin.divIdx].firstChild, begin.offset);
  range.setEnd(highlighter.textDivs[end.divIdx].firstChild, end.offset);

  return {
    id: `inferred_link_${id}`,
    unsafeUrl: url,
    url,
    annotationType: AnnotationType.LINK,
    rotation: 0,
    ...calculateLinkPosition(range, pdfPageView),
    // Populated in the annotationLayer to avoid unnecessary object creation,
    // since most inferred links overlap existing LinkAnnotations:
    borderStyle: null,
  };
}
|
|
|
|
class Autolinker {
|
|
static #index = 0;
|
|
|
|
static #regex;
|
|
|
|
static findLinks(text) {
|
|
// Regex can be tested and verified at https://regex101.com/r/rXoLiT/2.
|
|
this.#regex ??=
|
|
/\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
|
|
|
|
const [normalizedText, diffs] = normalize(text);
|
|
const matches = normalizedText.matchAll(this.#regex);
|
|
const links = [];
|
|
for (const match of matches) {
|
|
const [url, emailDomain] = match;
|
|
let raw;
|
|
if (
|
|
url.startsWith("www.") ||
|
|
url.startsWith("http://") ||
|
|
url.startsWith("https://")
|
|
) {
|
|
raw = url;
|
|
} else if (URL.canParse(`http://${emailDomain}`)) {
|
|
raw = url.startsWith("mailto:") ? url : `mailto:${url}`;
|
|
} else {
|
|
continue;
|
|
}
|
|
const absoluteURL = createValidAbsoluteUrl(raw, null, {
|
|
addDefaultProtocol: true,
|
|
});
|
|
if (absoluteURL) {
|
|
const [index, length] = getOriginalIndex(
|
|
diffs,
|
|
match.index,
|
|
url.length
|
|
);
|
|
links.push({ url: absoluteURL.href, index, length });
|
|
}
|
|
}
|
|
return links;
|
|
}
|
|
|
|
static processLinks(pdfPageView) {
|
|
return this.findLinks(
|
|
pdfPageView._textHighlighter.textContentItemsStr.join("\n")
|
|
).map(link => createLinkAnnotation(link, pdfPageView, this.#index++));
|
|
}
|
|
}
|
|
|
|
export { Autolinker };
|