193 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
			
		
		
	
	
			193 lines
		
	
	
		
			5.0 KiB
		
	
	
	
		
			JavaScript
		
	
	
	
	
	
'use strict';
 | |
/**
 * Extracts and caches per-page character data and page labels for a PDF
 * shown in a PDF.js-style viewer.
 *
 * The geometry/label helpers called from here (`getClosestWord`,
 * `getRangeBySelection`, `getPageLabelPoints`, `getPageLabel`, ...) are
 * free functions defined outside this class; their exact contracts are not
 * visible in this file section.
 */
class Extractor {
	/**
	 * @param {Object} pdfViewer - Viewer object; this class reads
	 *   `pdfViewer.pdfDocument` (`numPages`, `getPage()`) and
	 *   `pdfViewer._pageLabels`.
	 */
	constructor(pdfViewer) {
		this.pdfViewer = pdfViewer;
		// pageIndex -> filtered chars array
		this.charsCache = {};
		// pageIndex -> resolved page label
		this.pageLabelsCache = {};
		// `undefined` = not computed yet, `null` = computed but nothing found
		this.pageLabelPointsCache = undefined;
	}

	/**
	 * Loads the characters of a page and caches them.
	 *
	 * Only characters whose rotation is a multiple of 90 degrees and that are
	 * not a plain space are kept.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @returns {Promise<Object[]>} Filtered character objects
	 */
	async getPageChars(pageIndex) {
		if (this.charsCache[pageIndex]) {
			return this.charsCache[pageIndex];
		}

		const page = await this.pdfViewer.pdfDocument.getPage(pageIndex + 1);
		const textContent = await page.getTextContent();

		const chars = [];
		for (const item of textContent.items) {
			// NOTE(review): per-item `chars` presumably comes from a customized
			// PDF.js build — items without it are skipped instead of throwing.
			for (const char of item.chars || []) {
				// Note: Rotation is rounded in PDF.js
				if (char.rotation % 90 === 0 && char.c !== ' ') {
					chars.push(char);
				}
			}
		}

		this.charsCache[pageIndex] = chars;
		return chars;
	}

	/**
	 * Returns already cached characters of a page without triggering a load.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @returns {Object[]|null} Cached chars, or `null` when the page is not
	 *   cached or has no chars
	 */
	getPageCharsSync(pageIndex) {
		const chars = this.charsCache[pageIndex];
		return chars && chars.length ? chars : null;
	}

	/**
	 * Delegates to the external `getNextLineClosestOffset` helper using the
	 * cached chars of the page.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @param {number} offset - Char offset passed through to the helper
	 * @returns {*} Helper result, or `null` when the page has no cached chars
	 */
	getNextLineClosestOffset(pageIndex, offset) {
		const chars = this.getPageCharsSync(pageIndex);
		return chars && getNextLineClosestOffset(chars, offset);
	}

	/**
	 * Delegates to the external `getPrevLineClosestOffset` helper using the
	 * cached chars of the page.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @param {number} offset - Char offset passed through to the helper
	 * @returns {*} Helper result, or `null` when the page has no cached chars
	 */
	getPrevLineClosestOffset(pageIndex, offset) {
		const chars = this.getPageCharsSync(pageIndex);
		return chars && getPrevLineClosestOffset(chars, offset);
	}

	/**
	 * Delegates to the external `getClosestWord` helper with the first rect
	 * of the position.
	 *
	 * @param {{pageIndex: number, rects: Array}} position
	 * @returns {*} Helper result, or `null` when the page has no cached chars
	 */
	getClosestWord(position) {
		const chars = this.getPageCharsSync(position.pageIndex);
		return chars && getClosestWord(chars, position.rects[0]);
	}

	/**
	 * Delegates to the external `getClosestLine` helper with the first rect
	 * of the position.
	 *
	 * @param {{pageIndex: number, rects: Array}} position
	 * @returns {*} Helper result, or `null` when the page has no cached chars
	 */
	getClosestLine(position) {
		const chars = this.getPageCharsSync(position.pageIndex);
		return chars && getClosestLine(chars, position.rects[0]);
	}

	/**
	 * Builds a range for a selection on a page whose chars are already cached.
	 *
	 * The object returned by the external `getRangeBySelection` helper is
	 * reshaped in place: its `rects` property is moved into
	 * `position: { pageIndex, rects }`.
	 *
	 * @param {Object} selection
	 * @param {number} selection.pageIndex - Zero-based page index
	 * @param {*} selection.anchor - Passed through to the helper
	 * @param {*} selection.head - Passed through to the helper
	 * @param {*} selection.reverse - Passed through to the helper
	 * @returns {Object|null} The range, or `null` when the page has no cached
	 *   chars or the helper produced no range
	 */
	extractRange({ pageIndex, anchor, head, reverse }) {
		const chars = this.getPageCharsSync(pageIndex);
		if (!chars) {
			return null;
		}

		const range = getRangeBySelection({ chars, anchor, head, reverse });
		if (!range) {
			return null;
		}

		range.position = {
			pageIndex,
			rects: range.rects
		};
		delete range.rects;
		return range;
	}

	/**
	 * Computes a sortable string key for a position.
	 *
	 * The key has the form `PPPPP|OOOOOO|TTTTT`: page index, closest char
	 * offset and distance from the page top, each truncated and zero-padded
	 * to a fixed width.
	 *
	 * @param {{pageIndex: number, rects: Array<number[]>}} position
	 * @returns {Promise<string>}
	 */
	async getSortIndex(position) {
		const chars = await this.getPageChars(position.pageIndex);
		const page = position.pageIndex;
		// Falls back to 0 when the page has no chars or the helper returns a
		// falsy offset.
		const offset = chars.length && getClosestOffset(chars, position.rects[0]) || 0;
		const pageHeight = (await this.pdfViewer.pdfDocument.getPage(position.pageIndex + 1)).view[3];
		// presumably rects are [x1, y1, x2, y2] with a bottom-left origin, so
		// index 3 is the upper edge — verify against callers
		const top = Math.max(pageHeight - position.rects[0][3], 0);

		return [
			page.toString().slice(0, 5).padStart(5, '0'),
			offset.toString().slice(0, 6).padStart(6, '0'),
			Math.floor(top).toString().slice(0, 5).padStart(5, '0')
		].join('|');
	}

	/**
	 * Looks for page label points by feeding windows of four consecutive
	 * pages to the external `getPageLabelPoints` helper. At most five
	 * windows starting at the first pages are tried; the first truthy result
	 * wins. The outcome (including "nothing found") is cached.
	 *
	 * @returns {Promise<Array|null>} Label points, or `null` when none found
	 */
	async extractPageLabelPoints() {
		if (this.pageLabelPointsCache !== undefined) {
			return this.pageLabelPointsCache;
		}

		const { pdfDocument } = this.pdfViewer;
		for (let i = 0; i < 5 && i + 3 < pdfDocument.numPages; i++) {
			const pageHeight = (await pdfDocument.getPage(i + 1)).view[3];
			const chars1 = await this.getPageChars(i);
			const chars2 = await this.getPageChars(i + 1);
			const chars3 = await this.getPageChars(i + 2);
			const chars4 = await this.getPageChars(i + 3);
			const res = getPageLabelPoints(i, chars1, chars2, chars3, chars4, pageHeight);
			if (res) {
				this.pageLabelPointsCache = res;
				return res;
			}
		}

		this.pageLabelPointsCache = null;
		return null;
	}

	/**
	 * Extracts the label of a page from its text, using the chars of the
	 * page itself and of its existing neighbours.
	 *
	 * @param {number} pageIndex - Zero-based page index; must be a valid
	 *   page of the document
	 * @returns {Promise<*>} Result of the external `getPageLabel` helper, or
	 *   `null` when no label points are available
	 */
	async extractPageLabel(pageIndex) {
		const points = await this.extractPageLabelPoints();
		if (!points) {
			return null;
		}

		let charsPrev;
		let charsNext;
		if (pageIndex > 0) {
			charsPrev = await this.getPageChars(pageIndex - 1);
		}
		const charsCur = await this.getPageChars(pageIndex);
		if (pageIndex < this.pdfViewer.pdfDocument.numPages - 1) {
			charsNext = await this.getPageChars(pageIndex + 1);
		}

		return getPageLabel(pageIndex, charsPrev, charsCur, charsNext, points);
	}

	/**
	 * Resolves the label of a page and caches it.
	 *
	 * Priority: label extracted from page text, then the label assigned in
	 * `pdfViewer._pageLabels`, then the 1-based page number.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @returns {Promise<string>}
	 */
	async getPageLabel(pageIndex) {
		if (this.pageLabelsCache[pageIndex]) {
			return this.pageLabelsCache[pageIndex];
		}

		const extractedPageLabel = await this.extractPageLabel(pageIndex);
		const pageLabels = this.pdfViewer._pageLabels;
		const assignedPageLabel = pageLabels && pageLabels[pageIndex];

		const pageLabel = extractedPageLabel
			|| assignedPageLabel
			|| (pageIndex + 1).toString();

		this.pageLabelsCache[pageIndex] = pageLabel;
		return pageLabel;
	}

	/**
	 * Returns a previously resolved page label without doing any work.
	 *
	 * @param {number} pageIndex - Zero-based page index
	 * @returns {string|null} Cached label, or `null` when not cached
	 */
	getCachedPageLabel(pageIndex) {
		return this.pageLabelsCache[pageIndex] || null;
	}

	/**
	 * Finds the zero-based page index for a page label.
	 *
	 * Lookup order:
	 * 1. For numeric labels, the index predicted from the extracted label
	 *    points, accepted only if the label extracted from that page matches.
	 * 2. The position of the label in `pdfViewer._pageLabels`.
	 * 3. For positive numeric labels, the label treated as a 1-based page
	 *    number.
	 *
	 * @param {string|number} pageLabel
	 * @returns {Promise<number|null>} Page index (always a number), or `null`
	 *   when the label cannot be resolved
	 */
	async getPageIndexByLabel(pageLabel) {
		// Explicit radix so that inputs such as "0x10" are not read as hex
		const numericPageLabel = parseInt(pageLabel, 10);
		// Same test as the loose `numericPageLabel == pageLabel`, spelled out
		const isNumericLabel = Number(pageLabel) === numericPageLabel;

		const points = await this.extractPageLabelPoints();
		if (points && isNumericLabel) {
			const targetPageIndex = points[0].idx + (numericPageLabel - points[0].num);
			// The predicted index can land outside the document; requesting
			// such a page would reject instead of falling through to the
			// remaining lookups.
			const isInRange = targetPageIndex >= 0
				&& targetPageIndex < this.pdfViewer.pdfDocument.numPages;
			if (isInRange) {
				const targetPageLabel = await this.extractPageLabel(targetPageIndex);
				// Intentionally loose: `pageLabel` may be a number or a string
				// eslint-disable-next-line eqeqeq
				if (targetPageLabel == pageLabel) {
					return targetPageIndex;
				}
			}
		}

		const pageLabels = this.pdfViewer._pageLabels;
		if (pageLabels) {
			const targetPageIndex = pageLabels.indexOf(pageLabel);
			if (targetPageIndex !== -1) {
				return targetPageIndex;
			}
		}

		if (isNumericLabel && numericPageLabel > 0) {
			// Return a number like the other branches (previously a string)
			return numericPageLabel - 1;
		}

		return null;
	}
}
 |