Skip to content

Commit 04f03bd

Browse files
authored
Merge pull request #103 from johnseekins/fix-flatten-and-dep-tracking
Fix flatten, dep tracking, and other small bugs
2 parents 8821a06 + 2e78ed5 commit 04f03bd

13 files changed

Lines changed: 67 additions & 73 deletions

File tree

.config/mise.toml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
[tools]
2-
python = "3.14.1"
2+
# languages
33
node = "latest"
4-
lefthook = "latest"
5-
yamllint = "latest"
4+
python = "3.14.1"
5+
# linters and tooling
66
actionlint = "latest"
7-
shellcheck = "latest"
8-
markdownlint-cli2 = "latest"
97
jq = "latest"
10-
uv = "latest"
8+
lefthook = "latest"
9+
markdownlint-cli2 = "latest"
10+
ruff = "latest"
11+
shellcheck = "latest"
12+
yamllint = "latest"

.github/dependabot.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ updates:
55
directory: "/"
66
schedule:
77
interval: "weekly"
8+
groups:
9+
prod-dependencies:
10+
dependency-type: "production"
11+
dev-dependencies:
12+
dependency-type: "development"
813

914
- package-ecosystem: "github-actions"
1015
directory: "/"

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@ jobs:
1515
run: |
1616
curl -SsL https://mise.run | bash > /dev/null
1717
~/.local/bin/mise trust --quiet .config/mise.toml
18-
~/.local/bin/mise install --quiet python
18+
~/.local/bin/mise install --quiet ruff python
1919
eval "$(~/.local/bin/mise activate bash)" > /dev/null
2020
pip install -q --no-cache-dir --upgrade pip wheel uv
2121
uv sync
2222
- name: Ruff Format
2323
run: |
2424
eval "$(~/.local/bin/mise activate bash)" > /dev/null
25-
uv run ruff format --check
25+
ruff format --check
2626
- name: Ruff Check
2727
run: |
2828
eval "$(~/.local/bin/mise activate bash)" > /dev/null
29-
uv run ruff check
29+
ruff check
3030
- name: run mypy
3131
run: |
3232
eval "$(~/.local/bin/mise activate bash)" > /dev/null

.lefthook.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ pre-commit:
2727
- ".github/workflows/*.y*ml"
2828

2929
- name: Ruff Formatting
30-
run: uv run ruff format -q .
30+
run: ruff format -q .
3131
glob:
3232
- "*.py"
3333
stage_fixed: true
3434

3535
- name: Ruff Syntax checking
36-
run: uv run ruff check --fix -q
36+
run: ruff check --fix -q
3737
glob:
3838
- "*.py"
3939
stage_fixed: true

file_utils.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import copy
22
import json
33
import os
4-
import polars as pl
54
from schemas import enrichment_print_schema
65
from utils import (
76
convert_to_dataframe,
@@ -11,22 +10,6 @@
1110
import xlsxwriter # type: ignore [import-untyped]
1211

1312

14-
# Deals with list columns data that CSV cannot deal with.
15-
def _stringify_list_columns(df: pl.DataFrame) -> pl.DataFrame:
16-
"""Convert any List-type columns to JSON strings so CSV/Excel can handle them."""
17-
list_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype.base_type() == pl.List]
18-
if list_cols:
19-
df = df.with_columns(
20-
[
21-
pl.col(c)
22-
.map_elements(lambda val: json.dumps(val.to_list(), default=str), return_dtype=pl.String)
23-
.alias(c)
24-
for c in list_cols
25-
]
26-
)
27-
return df
28-
29-
3013
def export_to_file(
3114
facilities_data: dict,
3215
filename: str = "ice_detention_facilities_enriched",
@@ -40,13 +23,12 @@ def export_to_file(
4023
writer = convert_to_dataframe(facilities_data["facilities"])
4124
match file_type:
4225
case "xlsx":
26+
# Excel doesn't support timezones properly, so...
4327
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
44-
_ = _stringify_list_columns(writer).write_excel(workbook=wb, include_header=True, autofit=True)
45-
# _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
28+
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
4629
case "csv":
4730
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
48-
# writer.write_csv(file=f_out, include_header=True)
49-
_stringify_list_columns(writer).write_csv(file=f_out, include_header=True)
31+
_ = writer.write_csv(file=f_out, include_header=True)
5032
case "parquet":
5133
writer.write_parquet(full_name, use_pyarrow=True)
5234
case _:

ice_scrapers/general.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ def facilities_scrape_wrapper(
3333
logger.debug(" Matching %s for inspection details...", facility)
3434
# exact match (extremely unlikely)
3535
if facility.lower() in facility_name_map:
36+
"""
37+
flip the order so the newest inspection is likely first in the list
38+
because trying to convert these wildly inconsistent dates to sortable
39+
objects is probably a fool's errand, so we'll just hope for the best...
40+
"""
3641
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
3742
inspect
38-
)
43+
).reverse()
3944
break
4045
# logger.debug(" Checking fuzzy matches:")
4146
for k, v in facility_name_map.items():

ice_scrapers/inspections.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from bs4 import BeautifulSoup
2-
import zstandard as zstd
2+
import copy
33
import os
44
import pdfplumber
55
from pprint import pformat
66
import re
7+
from schemas import inspection_schema
78
import sys
89
from utils import (
910
logger,
1011
output_folder,
1112
req_get,
1213
)
14+
import zstandard as zstd
15+
1316
from .utils import download_file
1417

1518
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
@@ -45,7 +48,7 @@ def find_inspections(keep_text: bool = True) -> dict:
4548
links = content.select("a") # type: ignore [union-attr]
4649
for link in links:
4750
url = link["href"]
48-
obj = {"date": "", "url": url, "text": ""}
51+
obj = copy.deepcopy(inspection_schema)
4952
matches = text_re.search(link.text.strip())
5053
if len(matches.groups()) < 5: # type: ignore [union-attr]
5154
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
@@ -69,6 +72,5 @@ def find_inspections(keep_text: bool = True) -> dict:
6972
inspections[location].append(obj)
7073
else:
7174
inspections[location] = [obj]
72-
7375
logger.debug(pformat(inspections))
7476
return inspections

ice_scrapers/spreadsheet_load.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@
5656
"Guaranteed Minimum",
5757
"Last Inspection Type",
5858
"Last Inspection End Date",
59-
# "Pending YEAR Inspection",
6059
"Last Inspection Standard",
6160
"Last Final Rating",
6261
]
@@ -126,6 +125,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
126125
if not all(row[k] is not None for k in required_cols):
127126
logger.debug("Skipping bad row in spreadsheet: %s", row)
128127
continue
128+
if row["Name"] == "Name":
129+
logger.debug("Skipping bad header row: %s", row)
130+
continue
129131
# logger.debug("processing %s", row)
130132
details = copy.deepcopy(facility_schema)
131133
zcode, cleaned, other_zips = repair_zip(row["Zip"], row["City"])

main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def main() -> None:
5656
"--file-type",
5757
choices=supported_output_types,
5858
type=str,
59+
default="csv",
5960
help="type of file to export",
6061
)
6162
_ = parser.add_argument(

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ dependencies = [
2020
[dependency-groups]
2121
dev = [
2222
"mypy>=1.17.1",
23-
"ruff>=0.12.12",
2423
"types-beautifulsoup4>=4.12.0.20250516",
2524
"types-requests>=2.32.4.20250809",
2625
]

0 commit comments

Comments (0)