193 lines
5.0 KiB
JavaScript
193 lines
5.0 KiB
JavaScript
'use strict';
|
|
/**
 * Caches per-page character data and page labels for a PDF viewer and exposes
 * lookup helpers (closest word/line, selection ranges, sort indexes, page
 * labels) built on top of that cache.
 *
 * The heavy lifting is delegated to free functions that are expected to be in
 * scope where this class is defined (`getClosestWord`, `getClosestLine`,
 * `getClosestOffset`, `getNextLineClosestOffset`, `getPrevLineClosestOffset`,
 * `getRangeBySelection`, `getPageLabelPoints`, `getPageLabel`).
 */
class Extractor {
	/**
	 * @param {Object} pdfViewer - Viewer whose `pdfDocument` (page access,
	 *   `numPages`) and `_pageLabels` are read by this class.
	 */
	constructor(pdfViewer) {
		this.pdfViewer = pdfViewer;
		// pageIndex -> array of chars kept by getPageChars()
		this.charsCache = {};
		// pageIndex -> label resolved by getPageLabel()
		this.pageLabelsCache = {};
		// undefined = not computed yet; null = computed, nothing found
		this.pageLabelPointsCache = undefined;
	}

	/**
	 * Loads (and caches) the characters of a page.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @returns {Promise<Array>} Chars of the page, excluding spaces and chars
	 *   whose rotation is not a multiple of 90.
	 */
	async getPageChars(pageIndex) {
		const cached = this.charsCache[pageIndex];
		if (cached) {
			return cached;
		}

		// The document API is one-based, hence the + 1
		const page = await this.pdfViewer.pdfDocument.getPage(pageIndex + 1);
		const textContent = await page.getTextContent();

		const chars = [];
		for (const item of textContent.items) {
			for (const char of item.chars) {
				// Note: Rotation is rounded in PDF.js
				if (char.rotation % 90 === 0 && char.c !== ' ') {
					chars.push(char);
				}
			}
		}

		this.charsCache[pageIndex] = chars;
		return chars;
	}

	/**
	 * Returns already-cached chars of a page without triggering a load.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @returns {Array|null} The cached chars, or `null` when the page has not
	 *   been loaded yet or has no chars.
	 */
	getPageCharsSync(pageIndex) {
		const chars = this.charsCache[pageIndex];
		return chars && chars.length ? chars : null;
	}

	/**
	 * Delegates to the free `getNextLineClosestOffset(chars, offset)` helper.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @param {number} offset - Offset passed through to the helper.
	 * @returns {*} The helper's result, or `null` when no chars are cached.
	 */
	getNextLineClosestOffset(pageIndex, offset) {
		const chars = this.getPageCharsSync(pageIndex);
		return chars && getNextLineClosestOffset(chars, offset);
	}

	/**
	 * Delegates to the free `getPrevLineClosestOffset(chars, offset)` helper.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @param {number} offset - Offset passed through to the helper.
	 * @returns {*} The helper's result, or `null` when no chars are cached.
	 */
	getPrevLineClosestOffset(pageIndex, offset) {
		const chars = this.getPageCharsSync(pageIndex);
		return chars && getPrevLineClosestOffset(chars, offset);
	}

	/**
	 * Delegates to the free `getClosestWord(chars, rect)` helper using the
	 * first rect of the position.
	 *
	 * @param {{pageIndex: number, rects: Array}} position
	 * @returns {*} The helper's result, or `null` when no chars are cached.
	 */
	getClosestWord(position) {
		const chars = this.getPageCharsSync(position.pageIndex);
		return chars && getClosestWord(chars, position.rects[0]);
	}

	/**
	 * Delegates to the free `getClosestLine(chars, rect)` helper using the
	 * first rect of the position.
	 *
	 * @param {{pageIndex: number, rects: Array}} position
	 * @returns {*} The helper's result, or `null` when no chars are cached.
	 */
	getClosestLine(position) {
		const chars = this.getPageCharsSync(position.pageIndex);
		return chars && getClosestLine(chars, position.rects[0]);
	}

	/**
	 * Builds a range for a selection on an already-loaded page.
	 *
	 * @param {Object} selection
	 * @param {number} selection.pageIndex - Zero-based page index.
	 * @param {*} selection.anchor - Passed through to `getRangeBySelection`.
	 * @param {*} selection.head - Passed through to `getRangeBySelection`.
	 * @param {*} selection.reverse - Passed through to `getRangeBySelection`.
	 * @returns {Object|null} The range returned by `getRangeBySelection`, with
	 *   its `rects` moved into `position: { pageIndex, rects }`; `null` when
	 *   the page has no cached chars or no range was produced.
	 */
	extractRange({ pageIndex, anchor, head, reverse }) {
		const chars = this.getPageCharsSync(pageIndex);
		if (!chars) {
			return null;
		}

		const range = getRangeBySelection({ chars, anchor, head, reverse });
		if (!range) {
			return null;
		}

		// The helper's object is reshaped in place: rects move under `position`
		range.position = {
			pageIndex,
			rects: range.rects
		};
		delete range.rects;
		return range;
	}

	/**
	 * Computes a sortable string key for a position.
	 *
	 * The key has the form `PPPPP|OOOOOO|TTTTT`: page index, closest char
	 * offset (0 when the page has no chars) and the floored distance from
	 * `view[3]` of the page to `rects[0][3]` (clamped at 0). Each field is
	 * truncated to its width and left-padded with zeros.
	 *
	 * @param {{pageIndex: number, rects: Array<Array<number>>}} position
	 * @returns {Promise<string>}
	 */
	async getSortIndex(position) {
		const chars = await this.getPageChars(position.pageIndex);
		const page = position.pageIndex;
		const offset = chars.length && getClosestOffset(chars, position.rects[0]) || 0;
		const pageHeight = (await this.pdfViewer.pdfDocument.getPage(position.pageIndex + 1)).view[3];

		let top = pageHeight - position.rects[0][3];
		if (top < 0) {
			top = 0;
		}

		return [
			page.toString().slice(0, 5).padStart(5, '0'),
			offset.toString().slice(0, 6).padStart(6, '0'),
			Math.floor(top).toString().slice(0, 5).padStart(5, '0')
		].join('|');
	}

	/**
	 * Looks for page label points by probing windows of four consecutive
	 * pages, starting at each of the first five pages. The result (including
	 * a negative one) is cached.
	 *
	 * @returns {Promise<*|null>} Whatever `getPageLabelPoints` returned for the
	 *   first window that produced a truthy result, otherwise `null`.
	 */
	async extractPageLabelPoints() {
		if (this.pageLabelPointsCache !== undefined) {
			return this.pageLabelPointsCache;
		}

		const { pdfDocument } = this.pdfViewer;
		for (let i = 0; i < 5 && i + 3 < pdfDocument.numPages; i++) {
			const pageHeight = (await pdfDocument.getPage(i + 1)).view[3];
			const chars1 = await this.getPageChars(i);
			const chars2 = await this.getPageChars(i + 1);
			const chars3 = await this.getPageChars(i + 2);
			const chars4 = await this.getPageChars(i + 3);
			const res = getPageLabelPoints(i, chars1, chars2, chars3, chars4, pageHeight);
			if (res) {
				this.pageLabelPointsCache = res;
				return res;
			}
		}

		this.pageLabelPointsCache = null;
		return null;
	}

	/**
	 * Extracts the label of a page from its text, using the chars of the
	 * page itself and of its neighbours (when they exist).
	 *
	 * @param {number} pageIndex - Zero-based page index; must refer to an
	 *   existing page, otherwise loading its chars rejects.
	 * @returns {Promise<*|null>} Result of the free `getPageLabel` helper, or
	 *   `null` when no label points are available.
	 */
	async extractPageLabel(pageIndex) {
		const points = await this.extractPageLabelPoints();
		if (!points) {
			return null;
		}

		let charsPrev;
		let charsNext;
		if (pageIndex > 0) {
			charsPrev = await this.getPageChars(pageIndex - 1);
		}
		const charsCur = await this.getPageChars(pageIndex);
		if (pageIndex < this.pdfViewer.pdfDocument.numPages - 1) {
			charsNext = await this.getPageChars(pageIndex + 1);
		}

		return getPageLabel(pageIndex, charsPrev, charsCur, charsNext, points);
	}

	/**
	 * Resolves (and caches) the label of a page. Preference order: label
	 * extracted from the page text, label from `pdfViewer._pageLabels`, and
	 * finally the one-based page number as a string.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @returns {Promise<string>}
	 */
	async getPageLabel(pageIndex) {
		const cached = this.pageLabelsCache[pageIndex];
		if (cached) {
			return cached;
		}

		const extractedPageLabel = await this.extractPageLabel(pageIndex);
		const pageLabels = this.pdfViewer._pageLabels;
		const assignedPageLabel = pageLabels && pageLabels[pageIndex];

		const pageLabel = extractedPageLabel
			|| assignedPageLabel
			|| (pageIndex + 1).toString();

		this.pageLabelsCache[pageIndex] = pageLabel;
		return pageLabel;
	}

	/**
	 * Returns a label previously resolved by `getPageLabel()`.
	 *
	 * @param {number} pageIndex - Zero-based page index.
	 * @returns {string|null} The cached label, or `null` when there is none.
	 */
	getCachedPageLabel(pageIndex) {
		return this.pageLabelsCache[pageIndex] || null;
	}

	/**
	 * Finds the page index that corresponds to a page label.
	 *
	 * Tries, in order: (1) the index predicted from the extracted label
	 * points, accepted only if the label extracted from that page matches;
	 * (2) the position of the label in `pdfViewer._pageLabels`; (3) for a
	 * positive numeric label, the label minus one.
	 *
	 * @param {string|number} pageLabel
	 * @returns {Promise<number|string|null>} A numeric index from steps 1-2, a
	 *   numeric *string* from step 3, or `null` when nothing matched.
	 */
	async getPageIndexByLabel(pageLabel) {
		// Explicit radix so labels like "0x10" are not read as hexadecimal
		const numericPageLabel = parseInt(pageLabel, 10);
		// Loose comparison on purpose: accepts both 12 and "12", rejects "12a"
		const isNumericLabel = numericPageLabel == pageLabel;

		const points = await this.extractPageLabelPoints();
		if (points && isNumericLabel) {
			const targetPageIndex = points[0].idx + (numericPageLabel - points[0].num);
			const { numPages } = this.pdfViewer.pdfDocument;
			// Only probe pages that exist; asking the document for an
			// out-of-range page would reject instead of letting the
			// fallbacks below run
			if (targetPageIndex >= 0 && targetPageIndex < numPages) {
				const targetPageLabel = await this.extractPageLabel(targetPageIndex);
				if (targetPageLabel == pageLabel) {
					return targetPageIndex;
				}
			}
		}

		const pageLabels = this.pdfViewer._pageLabels;
		if (pageLabels) {
			const targetPageIndex = pageLabels.indexOf(pageLabel);
			if (targetPageIndex !== -1) {
				return targetPageIndex;
			}
		}

		if (isNumericLabel && numericPageLabel > 0) {
			// NOTE(review): this path returns a string while the paths above
			// return a number; kept as-is because callers are not visible here
			return (numericPageLabel - 1).toString();
		}

		return null;
	}
}
|