|
1 | 1 | /** |
2 | | - * PDF_Tools.js - Final Combined Library |
| 2 | + * PDF_Tools.js - Final Combined Library (v1.4.2) |
3 | 3 | */ |
4 | 4 |
|
| 5 | +// NetWebView2Lib_pdfjs_Tools.js |
/**
 * Extracts the text of every page of the PDF currently open in the PDF.js
 * viewer and posts it to the WebView2 host (AutoIt) as a JSON string.
 *
 * On success the posted package has the shape:
 *   { type: 'PDF_DATA_PACKAGE', metadata, pagesCount, pages: [{ pageIndex, text }] }
 * where `text` approximates the visual layout: text items are grouped into
 * lines by their Y coordinate and padded horizontally with spaces according
 * to their X position and width.
 *
 * On failure (no document loaded, or PDF.js rejecting) a
 *   { type: 'error', message }
 * package is posted instead, so the host is never left waiting silently.
 *
 * If the page is not a PDF.js viewer at all (`PDFViewerApplication` is not
 * defined) the function returns without posting anything.
 *
 * @returns {Promise<void>} Resolves once a message has been posted (or skipped).
 */
async function PDF_ExtractToJSON() {
    if (typeof PDFViewerApplication === 'undefined') return;

    // Items whose Y coordinates differ by more than this start a new line.
    const LINE_THRESHOLD = 5;
    // Horizontal distance (in PDF units) represented by one space character.
    const CHAR_WIDTH = 5;

    const postToHost = (payload) => {
        window.chrome.webview.postMessage(JSON.stringify(payload));
    };

    // Groups items into visual lines (top to bottom), each ordered left to right.
    // A strict Y sort is used first because a threshold-based comparator is not
    // transitive and would make the resulting order implementation-defined.
    const groupIntoLines = (items) => {
        const byY = [...items].sort((a, b) => b.y - a.y);
        const lines = [];
        let current = null;
        let prevY = 0;
        for (const item of byY) {
            if (current === null || prevY - item.y > LINE_THRESHOLD) {
                current = [];
                lines.push(current);
            }
            current.push(item);
            prevY = item.y;
        }
        for (const line of lines) {
            line.sort((a, b) => a.x - b.x);
        }
        return lines;
    };

    // Renders one line, inserting spaces proportional to the gap between the
    // end of the previous item and the start of the next one.
    const renderLine = (line) => {
        let text = '';
        let cursorX = 0;
        for (const item of line) {
            const gap = Math.floor((item.x - cursorX) / CHAR_WIDTH);
            text += ' '.repeat(Math.max(0, gap)) + item.str;
            // Advance using the real width reported by PDF.js for this item.
            cursorX = item.x + item.width;
        }
        return text;
    };

    try {
        const pdf = PDFViewerApplication.pdfDocument;
        if (!pdf) {
            // The viewer exists but no document has been loaded yet.
            postToHost({ type: 'error', message: 'No PDF document is loaded' });
            return;
        }

        const pdfData = {
            type: 'PDF_DATA_PACKAGE',
            metadata: (await pdf.getMetadata()).info,
            pagesCount: pdf.numPages, // Explicitly send page count for AutoIt
            pages: []
        };

        for (let i = 1; i <= pdf.numPages; i++) {
            const page = await pdf.getPage(i);
            const textContent = await page.getTextContent();

            // transform[4] / transform[5] are the item's X / Y translation.
            const items = textContent.items.map((item) => ({
                str: item.str,
                x: item.transform[4],
                y: item.transform[5],
                width: item.width
            }));

            let pageText = '';
            groupIntoLines(items).forEach((line, lineIndex) => {
                if (lineIndex > 0) {
                    pageText = pageText.trimEnd() + '\n';
                }
                pageText += renderLine(line);
            });

            // Push processed page data to the package
            pdfData.pages.push({
                pageIndex: i,
                text: pageText.trim()
            });
        }

        // Send the final JSON package back to AutoIt
        postToHost(pdfData);
    } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        postToHost({ type: 'error', message });
    }
}
32 | 63 |
|
33 | 64 | async function PDF_ExtractLegacy() { |
|
0 commit comments