Skip to content

Commit 7658ed2

Browse files
committed
extract and compress text from original PDF report
Signed-off-by: John Seekins <john@robot-house.us>
1 parent 657f1ce commit 7658ed2

3 files changed

Lines changed: 122 additions & 55 deletions

File tree

.config/mise.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[tools]
2-
python = "3.13.3"
2+
python = "3.14.1"
33
node = "latest"
44
lefthook = "latest"
55
yamllint = "latest"

ice_scrapers/inspections.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
from bs4 import BeautifulSoup
2+
from compression import zstd
23
import os
34
import pdfplumber
45
from pprint import pformat
56
import re
7+
import sys
68
from utils import (
79
logger,
810
output_folder,
@@ -53,9 +55,15 @@ def find_inspections() -> dict:
5355
# fifth capture group should be the inspection date
5456
date: str = matches.group(5) # type: ignore [union-attr]
5557
obj["date"] = date
56-
logger.debug(" Facility: %s, date: %s, details: %s", location, date, url)
57-
obj["text"] = _extract_txt(str(url))
58-
exit(1)
58+
text = zstd.compress(_extract_txt(str(url)).encode("utf-8"))
59+
logger.debug(
60+
" Facility: %s, date: %s, url: %s, report length (compressed): %s",
61+
location,
62+
date,
63+
url,
64+
sys.getsizeof(text),
65+
)
66+
obj["text"] = text
5967
if location in inspections:
6068
inspections[location].append(obj)
6169
else:

0 commit comments

Comments
 (0)