Skip to content

Commit 657f1ce

Browse files
committed
add text extraction
Signed-off-by: John Seekins <john@robot-house.us>
1 parent 19c15f4 commit 657f1ce

5 files changed

Lines changed: 257 additions & 19 deletions

File tree

ice_scrapers/general.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
def facilities_scrape_wrapper(
1616
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
1717
) -> tuple[dict, dict]:
18+
_ = find_inspections()
1819
agencies = scrape_agencies(keep_sheet, force_download)
1920
facilities_data = copy.deepcopy(facilities_schema)
2021
facilities = load_sheet(keep_sheet, force_download)
@@ -25,6 +26,5 @@ def facilities_scrape_wrapper(
2526
field_offices = scrape_field_offices()
2627
facilities_data = merge_field_offices(facilities_data, field_offices)
2728
facilities_data = insert_additional_facilities(facilities_data)
28-
_ = find_inspections()
2929

3030
return facilities_data, agencies

ice_scrapers/inspections.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from bs4 import BeautifulSoup
22
import os
3+
import pdfplumber
34
from pprint import pformat
45
import re
56
from utils import (
@@ -11,10 +12,27 @@
1112

1213
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
1314
storage_dir = f"{output_folder}{os.sep}inspections{os.sep}"
15+
"""
16+
example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
17+
example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
18+
example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
19+
"""
1420
text_re = re.compile(r"^(\w+\s)?(\d+)\s(.+)\s(-|–)\s(.+)$")
1521

1622

17-
def find_inspections(download_reports: bool = False):
23+
def _extract_txt(url: str) -> str:
    """Download the PDF at *url* into the inspections storage dir and return its text.

    Args:
        url: direct link to a PDF inspection report.

    Returns:
        The concatenated text of every page, each page preceded by a newline
        (so a non-empty PDF yields a string starting with "\\n", matching the
        previous accumulation format).
    """
    file_name = url.split("/")[-1]  # type: ignore [union-attr]
    local_path = f"{storage_dir}{file_name}"
    download_file(str(url), local_path)
    pages: list[str] = []
    with pdfplumber.open(local_path) as pdf:
        for idx, page in enumerate(pdf.pages):
            # extract_text() returns None for image-only pages; the old
            # f-string accumulation silently embedded the literal "None"
            txt = page.extract_text() or ""
            logger.debug(" Page %s: %s", idx + 1, txt)
            pages.append(txt)
    # single join instead of repeated f-string concatenation (avoids quadratic copies)
    return "".join(f"\n{txt}" for txt in pages)
33+
34+
35+
def find_inspections() -> dict:
1836
os.makedirs(storage_dir, exist_ok=True)
1937
inspections: dict = {}
2038
logger.info("Collecting inspection reports from %s", root_url)
@@ -25,24 +43,23 @@ def find_inspections(download_reports: bool = False):
2543
links = content.select("a") # type: ignore [union-attr]
2644
for link in links:
2745
url = link["href"]
28-
file_name = url.split("/")[-1] # type: ignore [union-attr]
29-
"""
30-
example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
31-
example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
32-
example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
33-
There are inconsistent hyphens!
34-
"""
35-
text = text_re.search(link.text.strip())
46+
obj = {"date": "", "url": url, "text": ""}
47+
matches = text_re.search(link.text.strip())
48+
if len(matches.groups()) < 5: # type: ignore [union-attr]
49+
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
50+
continue
3651
# third capture group should be the facility name
37-
location: str = text[3] # type: ignore [index]
52+
location: str = matches.group(3) # type: ignore [union-attr]
3853
# fifth capture group should be the inspection date
39-
date: str = text[5] # type: ignore [index]
40-
logger.debug("Facility: %s, date: %s, details: %s", location, date, url)
41-
if download_reports:
42-
download_file(str(url), f"{output_folder}{os.sep}inspections{os.sep}{file_name}")
54+
date: str = matches.group(5) # type: ignore [union-attr]
55+
obj["date"] = date
56+
logger.debug(" Facility: %s, date: %s, details: %s", location, date, url)
57+
obj["text"] = _extract_txt(str(url))
58+
exit(1)
4359
if location in inspections:
44-
inspections[location].append({"date": date, "details": url})
60+
inspections[location].append(obj)
4561
else:
46-
inspections[location] = [{"date": date, "details": url}]
62+
inspections[location] = [obj]
63+
4764
logger.debug(pformat(inspections))
4865
return inspections

ice_scrapers/utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
from bs4 import BeautifulSoup
2+
import os
23
import re
34
from utils import (
45
logger,
56
session,
67
)
78

89

9-
def download_file(link: str, path: str) -> None:
10+
def download_file(link: str, path: str, redownload: bool = False) -> None:
    """
    Standard pattern for downloading a binary file from a URL.

    Args:
        link: URL to fetch.
        path: local filesystem destination for the downloaded bytes.
        redownload: when True, fetch even if *path* already exists non-empty.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0 and not redownload:
        logger.debug(" Skipping redownload of existing file %s", path)
        # BUG FIX: without this return the guard only logged and the file
        # was re-downloaded anyway
        return
    resp = session.get(link, timeout=120, stream=True)
    # accumulate size from the streamed chunks; touching resp.content would
    # buffer the entire body in memory and defeat stream=True
    size = 0
    with open(path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    logger.debug(" Wrote %s byte file to %s", size, path)
2023

2124

2225
def special_facilities(facility: dict) -> dict:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies = [
88
"beautifulsoup4>=4.13.5",
99
"fastexcel>=0.15.1",
1010
"lxml>=6.0.1",
11+
"pdfplumber>=0.11.8",
1112
"polars>=1.33.0",
1213
"pyarrow>=21.0.0",
1314
"requests>=2.32.5",

0 commit comments

Comments
 (0)