Skip to content

Commit 15dad27

Browse files
authored
Merge pull request #97 from mlipok/patch-1
Update 014-pdfJS-Static_PDF_Viewer.au3 - GET PDF DATA
2 parents ed1e55c + 221c188 commit 15dad27

2 files changed

Lines changed: 135 additions & 44 deletions

File tree

examples/014-pdfJS-Static_PDF_Viewer.au3

Lines changed: 80 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
#WIP - this Example is imported from 1.5.0 UDF - and is in "WORK IN PROGRESS" state
2-
31
#AutoIt3Wrapper_UseX64=n
42
#AutoIt3Wrapper_Run_AU3Check=Y
53
#AutoIt3Wrapper_AU3Check_Stop_OnWarning=y
@@ -19,6 +17,8 @@
1917
#include <GUIConstantsEx.au3>
2018
#include <WindowsConstants.au3>
2119
#include "..\NetWebView2Lib.au3"
20+
#include <Array.au3>
21+
#include <String.au3>
2222

2323
; Global objects
2424

@@ -66,7 +66,52 @@ Func _Example()
6666
Local $s_PDF_TEXT = ''
6767

6868
$s_JavaScript_snipp = "PDF_ExtractToJSON();"
69-
$s_PDF_TEXT = _NetWebView2_ExecuteScript($oWeb, $s_JavaScript_snipp, $NETWEBVIEW2_EXECUTEJS_MODE2_RESULT)
69+
_NetWebView2_ExecuteScript($oWeb, $s_JavaScript_snipp, $NETWEBVIEW2_EXECUTEJS_MODE2_RESULT)
70+
71+
; Get JSON data
72+
$s_PDF_TEXT = Get_Data_Sync("", "PDF_DATA_PACKAGE")
73+
74+
If $s_PDF_TEXT <> "" Then ; === JSON REPORT ===
75+
Local $oJson = _NetJson_CreateParser($s_PDF_TEXT)
76+
If @error Then Return ConsoleWrite("!> Error: Failed to create NetJson object." & @CRLF)
77+
78+
ConsoleWrite(@CRLF & "==================== PDF REPORT ====================" & @CRLF)
79+
80+
; METADATA SECTION
81+
Local $sTitle = $oJson.GetTokenValue("metadata.Title")
82+
Local $sAuthor = $oJson.GetTokenValue("metadata.Author")
83+
ConsoleWrite(StringFormat("+ Title: %s\n+ Author: %s\n", $sTitle, $sAuthor))
84+
ConsoleWrite("+ Format: " & $oJson.GetTokenValue("metadata.PDFFormatVersion") & @CRLF)
85+
86+
; PAGES SECTION
87+
;~ Local $iActualPages = $oJson.GetArrayLength("pages")
88+
Local $iPages = Number($oJson.GetTokenValue("pagesCount"))
89+
ConsoleWrite("----------------------------------------------------" & @CRLF)
90+
ConsoleWrite("- Total Pages Detected: " & $iPages & @CRLF)
91+
92+
For $i = 0 To $iPages - 1 ; Get and Clean Page Text
93+
Local $sRawPageText = $oJson.GetTokenValue("pages[" & $i & "].text")
94+
Local $sCleanText = StringReplace($sRawPageText, Chr(160), " ") ; Normalize spaces
95+
ConsoleWrite(StringFormat(">>> Page [%d] Content:\n%s\n", $i + 1, $sCleanText))
96+
ConsoleWrite("----------------------------------------------------" & @CRLF)
97+
Next
98+
99+
; DATA GRID SECTION
100+
Local $sTable = $oJson.FlattenToTable("", @CRLF)
101+
Local $aFinalGrid = _ArrayFromString($sTable, "", @CRLF, True)
102+
103+
If Not @error Then
104+
ConsoleWrite("+ Data Grid: Success. Displaying UI Table..." & @CRLF)
105+
_ArrayDisplay($aFinalGrid, "v1.4.1 Final Table View")
106+
Else
107+
ConsoleWrite("!> Error: FlattenToTable failed to generate array." & @CRLF)
108+
EndIf
109+
110+
ConsoleWrite("====================================================" & @CRLF & @CRLF)
111+
Else
112+
ConsoleWrite("!> Error: No PDF data received within timeout." & @CRLF)
113+
EndIf
114+
70115
MsgBox($MB_TOPMOST, "TEST #" & @ScriptLineNumber, "After:" & @CRLF & $s_JavaScript_snipp & @CRLF & $s_PDF_TEXT)
71116

72117
$s_JavaScript_snipp = "PDF_HighlightSpansContainingText('2016', 'blue', 'pink');"
@@ -108,24 +153,17 @@ Func __UserEventHandler__Bridge_OnMessageReceived($oWebV2M, $hGUI, $sMsg)
108153

109154
If $sFirstChar = "{" Or $sFirstChar = "[" Then ; 1. JSON Messaging
110155
ConsoleWrite("+> : Processing JSON Messaging..." & @CRLF)
111-
Local $oJson = ObjCreate("NetJson.Parser")
112-
If ObjName($oJson, $OBJ_PROGID) <> 'NetWebView2.Manager' Then Return ConsoleWrite("!> Error: Failed to create NetJson object." & @CRLF)
156+
Local $oJson = _NetJson_CreateParser($sMsg)
157+
If @error Then Return ConsoleWrite("!> Error: Failed to create NetJson object." & @CRLF)
113158

114-
$oJson.Parse($sMsg)
115159
Local $sJobType = $oJson.GetTokenValue("type")
116160

117161
Switch $sJobType
118162
Case "COM_TEST"
119163
ConsoleWrite("- COM_TEST Confirmed: " & $oJson.GetTokenValue("status") & @CRLF)
120164

121165
Case "PDF_DATA_PACKAGE"
122-
ConsoleWrite("> PDF Metadata: " & $oJson.GetTokenValue("metadata.title") & " by " & $oJson.GetTokenValue("metadata.author") & @CRLF)
123-
124-
; Loop through pages (if your parser supports it)
125-
Local $iPages = $oJson.GetTokenValue("metadata.pagesCount")
126-
For $i = 0 To $iPages - 1
127-
ConsoleWrite("- Page " & ($i + 1) & " content: " & StringLeft($oJson.GetTokenValue("pages[" & $i & "].text"), 150) & "..." & @CRLF)
128-
Next
166+
Get_Data_Sync($sMsg, "PDF_DATA_PACKAGE")
129167
EndSwitch
130168

131169
Else ; 2. Legacy / Native Pipe-Delimited Messaging
@@ -154,6 +192,28 @@ Func __UserEventHandler__Bridge_OnMessageReceived($oWebV2M, $hGUI, $sMsg)
154192
EndIf
155193
EndFunc ;==>__UserEventHandler__Bridge_OnMessageReceived
156194

195+
; #FUNCTION# ====================================================================================================================
; Name ..........: Get_Data_Sync
; Description ...: Small mailbox that lets the asynchronous message handler hand a payload over to the main script.
;                  Called with non-empty $sData it stores the payload under $sJobType;
;                  called with empty $sData it waits for a payload stored under $sJobType and returns it.
; Parameters ....: $sData     - [optional] Payload to store. Leave empty ("") to request data instead. Default is "".
;                  $sJobType  - [optional] Key under which the payload is stored / looked up. Default is "DEFAULT".
;                  $iTimeout  - [optional] Maximum time to wait in request mode, in milliseconds. Default is 5000.
; Return values .: Store mode   - True
;                  Request mode - The stored payload; the entry is removed after it has been read.
;                  Failure      - "" and sets @error = 1 when no payload shows up within $iTimeout.
; ===============================================================================================================================
Func Get_Data_Sync($sData = "", $sJobType = "DEFAULT", $iTimeout = 5000)
	; Static Map: survives between calls, one slot per job type.
	Local Static $mMailbox[]

	; --- Store mode (used by the event handler) ---
	If $sData <> "" Then
		$mMailbox[$sJobType] = $sData
		Return True
	EndIf

	; --- Request mode (used by the main script) ---
	Local $hTimer = TimerInit()
	While 1
		If MapExists($mMailbox, $sJobType) Then ExitLoop
		If TimerDiff($hTimer) > $iTimeout Then Return SetError(1, 0, "")
		Sleep(10) ; poll every 10 ms until the slot is filled or the timeout expires
	WEnd

	; Hand the payload out exactly once: read it, then clear the slot for the next request.
	Local $sPayload = $mMailbox[$sJobType]
	MapRemove($mMailbox, $sJobType)
	Return $sPayload
EndFunc   ;==>Get_Data_Sync
216+
157217
Func __SetupStaticPDF(ByRef $oWeb, $s_PDF_Path, $sExpectedTitle, $bBlockLinks = False, $bBlockSelection = False, $bShowToolbar = False)
158218
; 🏆 https://mozilla.github.io/pdf.js/
159219

@@ -230,17 +290,17 @@ Func __SetupStaticPDF(ByRef $oWeb, $s_PDF_Path, $sExpectedTitle, $bBlockLinks =
230290

231291
$oWeb.AddInitializationScript($sCleanupJS)
232292

293+
233294
; Fix slashes in Path for URL
234-
Local $s_PDF_URL = StringReplace($s_PDF_Path, "\", "/")
235-
$s_PDF_URL = $oWeb.EncodeURI($s_PDF_URL)
236-
Local $s_PDF_JS_URL = StringReplace(@ScriptDir & "\JS_Lib\pdfjs\web\viewer.html" & "?file=", "\", "/")
237-
Local $s_Viewer_URL = "file:///" & $s_PDF_JS_URL & $s_PDF_URL
238-
ConsoleWrite("- $s_Viewer_URL= " & $s_Viewer_URL & @CRLF)
295+
Local $sViewerPath = StringReplace(@ScriptDir & "\JS_Lib\pdfjs\web\viewer.html", "\", "/")
296+
Local $sPDF_URL = "file:///" & StringReplace($s_PDF_Path, "\", "/")
297+
Local $sFinalURL = "file:///" & $sViewerPath & "?file=" & $oWeb.EncodeURI($sPDF_URL)
298+
ConsoleWrite("Correct URL: " & $sFinalURL & @CRLF)
239299

240-
_NetWebView2_Navigate($oWeb, $s_Viewer_URL, $NETWEBVIEW2_MESSAGE__TITLE_CHANGED, $sExpectedTitle, 5000)
300+
_NetWebView2_Navigate($oWeb, $sFinalURL, $NETWEBVIEW2_MESSAGE__TITLE_CHANGED, $sExpectedTitle, 5000)
241301
ConsoleWrite("! we're done with navigation, but check how many more messages there are below. SLN=" & @ScriptLineNumber & @CRLF)
242302
MsgBox($MB_TOPMOST, "TEST #" & @ScriptLineNumber, 'Wait for all messages to full loading PDF by pdf.js')
243-
#EndRegion ; mLipok #TODO this should be fixed by better LoadWait, I mean adding a check if the desired title appears
303+
; mLipok #TODO: replace the manual wait above with a better LoadWait that checks whether the expected title has appeared
244304

245305
; $oWeb.IsZoomControlEnabled = False ; <--- It doesn't work in PDF.
246306
$oWeb.DisableBrowserFeatures()

examples/JS_Lib/NetWebView2Lib_pdfjs_Tools.js

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,64 @@
11
/**
2-
* PDF_Tools.js - Final Combined Library
2+
* PDF_Tools.js - Final Combined Library (v1.4.2)
33
*/
44

5+
// NetWebView2Lib_pdfjs_Tools.js
56
/**
 * Extracts metadata and layout-aware text from the PDF currently loaded in the
 * pdf.js viewer and posts it to the host through the WebView2 message bridge.
 *
 * On success the posted JSON has the shape:
 *   { type: 'PDF_DATA_PACKAGE', metadata: <pdf.js info object>, pagesCount: <number>,
 *     pages: [ { pageIndex: <1-based>, text: <string> }, ... ] }
 *
 * On any failure (viewer not available, document not loaded yet, pdf.js error)
 * a message { type: 'error', message: <string> } is posted instead, so the host
 * is told what went wrong rather than only running into its timeout.
 *
 * @returns {Promise<void>} Resolves after the message has been posted; never rejects.
 */
async function PDF_ExtractToJSON() {
    const LINE_TOLERANCE = 5; // max Y distance (PDF units) between neighbours on the same text line
    const CHAR_WIDTH = 5;     // horizontal distance (PDF units) represented by one space character

    // Post a payload to the host; silently does nothing outside of a WebView2 context.
    const post = (payload) => {
        const webview = typeof window !== 'undefined' && window.chrome && window.chrome.webview;
        if (webview) webview.postMessage(JSON.stringify(payload));
    };

    try {
        if (typeof PDFViewerApplication === 'undefined' || !PDFViewerApplication.pdfDocument) {
            throw new Error('PDF document is not loaded yet.');
        }

        const pdf = PDFViewerApplication.pdfDocument;
        const pdfData = {
            type: 'PDF_DATA_PACKAGE',
            metadata: (await pdf.getMetadata()).info,
            pagesCount: pdf.numPages, // Explicitly send page count for AutoIt
            pages: []
        };

        for (let i = 1; i <= pdf.numPages; i++) {
            const page = await pdf.getPage(i);
            const textContent = await page.getTextContent();

            // Keep only real text items and order them top-to-bottom.
            // A plain numeric comparator is used on purpose: mixing a Y threshold with an
            // X fallback in one comparator is not transitive and makes the sort order undefined.
            const items = textContent.items
                .filter(item => typeof item.str === 'string' && item.transform)
                .map(item => ({
                    str: item.str,
                    x: item.transform[4],
                    y: item.transform[5],
                    width: item.width || 0
                }))
                .sort((a, b) => b.y - a.y);

            // Group items into visual lines: a new line starts when the Y gap to the
            // previous item exceeds the tolerance.
            const lines = [];
            let prevY = null;
            for (const item of items) {
                if (prevY === null || prevY - item.y > LINE_TOLERANCE) lines.push([]);
                lines[lines.length - 1].push(item);
                prevY = item.y;
            }

            // Render each line left-to-right, padding with spaces proportional to the
            // horizontal gap between the end of the previous item and the start of this one.
            const pageText = lines
                .map(line => {
                    let text = '';
                    let lastX = 0;
                    for (const item of line.sort((a, b) => a.x - b.x)) {
                        const spaces = Math.floor((item.x - lastX) / CHAR_WIDTH);
                        text += ' '.repeat(Math.max(0, spaces)) + item.str;
                        lastX = item.x + item.width;
                    }
                    return text.trimEnd();
                })
                .filter(text => text !== '') // whitespace-only lines carry no information
                .join('\n');

            // Push processed page data to the package
            pdfData.pages.push({
                pageIndex: i,
                text: pageText.trim()
            });
        }

        // Send the final JSON package back to AutoIt
        post(pdfData);
    } catch (e) {
        post({ type: 'error', message: e && e.message ? e.message : String(e) });
    }
}
3263

3364
async function PDF_ExtractLegacy() {

0 commit comments

Comments
 (0)