extract and compress text from original PDF report

johnseekins · johnseekins · commit 7658ed26a724 · 2025-12-03T22:16:49.000-07:00
Signed-off-by: John Seekins <john@robot-house.us>
diff --git a/.config/mise.toml b/.config/mise.toml
@@ -1,5 +1,5 @@
 [tools]
-python = "3.13.3"
+python = "3.14.1"
 node = "latest"
 lefthook = "latest"
 yamllint = "latest"
diff --git a/ice_scrapers/inspections.py b/ice_scrapers/inspections.py
@@ -1,8 +1,10 @@
 from bs4 import BeautifulSoup
+from compression import zstd
 import os
 import pdfplumber
 from pprint import pformat
 import re
+import sys
 from utils import (
     logger,
     output_folder,
@@ -53,9 +55,15 @@ def find_inspections() -> dict:
         # fifth capture group should be the inspection date
         date: str = matches.group(5)  # type: ignore [union-attr]
         obj["date"] = date
-        logger.debug("    Facility: %s, date: %s, details: %s", location, date, url)
-        obj["text"] = _extract_txt(str(url))
-        exit(1)
+        text = zstd.compress(_extract_txt(str(url)).encode("utf-8"))
+        logger.debug(
+            "    Facility: %s, date: %s, url: %s, report length (compressed): %s",
+            location,
+            date,
+            url,
+            sys.getsizeof(text),
+        )
+        obj["text"] = text
         if location in inspections:
             inspections[location].append(obj)
         else:
diff --git a/uv.lock b/uv.lock