Skip to content

Commit bf060df

Browse files
committed
correct usage of ruff in CI and reverse inspections order
Signed-off-by: John Seekins <john@robot-house.us>
1 parent f786ad3 commit bf060df

5 files changed

Lines changed: 24 additions & 19 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@ jobs:
1515
run: |
1616
curl -SsL https://mise.run | bash > /dev/null
1717
~/.local/bin/mise trust --quiet .config/mise.toml
18-
~/.local/bin/mise install --quiet python
18+
~/.local/bin/mise install --quiet ruff python
1919
eval "$(~/.local/bin/mise activate bash)" > /dev/null
2020
pip install -q --no-cache-dir --upgrade pip wheel uv
2121
uv sync
2222
- name: Ruff Format
2323
run: |
2424
eval "$(~/.local/bin/mise activate bash)" > /dev/null
25-
uv run ruff format --check
25+
ruff format --check
2626
- name: Ruff Check
2727
run: |
2828
eval "$(~/.local/bin/mise activate bash)" > /dev/null
29-
uv run ruff check
29+
ruff check
3030
- name: run mypy
3131
run: |
3232
eval "$(~/.local/bin/mise activate bash)" > /dev/null

ice_scrapers/general.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ def facilities_scrape_wrapper(
3333
logger.debug(" Matching %s for inspection details...", facility)
3434
# exact match (extremely unlikely)
3535
if facility.lower() in facility_name_map:
36+
"""
37+
flip the order so the newest inspection is likely first in the list
38+
because trying to convert these wildly inconsistent dates to sortable
39+
objects is probably a fool's errand, so we'll just hope for the best...
40+
"""
3641
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
3742
inspect
38-
)
43+
).reverse()
3944
break
4045
# logger.debug(" Checking fuzzy matches:")
4146
for k, v in facility_name_map.items():

ice_scrapers/inspections.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from bs4 import BeautifulSoup
2-
import zstandard as zstd
2+
import copy
33
import os
44
import pdfplumber
55
from pprint import pformat
66
import re
7+
from schemas import inspection_schema
78
import sys
89
from utils import (
910
logger,
1011
output_folder,
1112
req_get,
1213
)
14+
import zstandard as zstd
15+
1316
from .utils import download_file
1417

1518
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
@@ -45,7 +48,7 @@ def find_inspections(keep_text: bool = True) -> dict:
4548
links = content.select("a") # type: ignore [union-attr]
4649
for link in links:
4750
url = link["href"]
48-
obj = {"date": "", "url": url, "text": ""}
51+
obj = copy.deepcopy(inspection_schema)
4952
matches = text_re.search(link.text.strip())
5053
if len(matches.groups()) < 5: # type: ignore [union-attr]
5154
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
@@ -69,6 +72,5 @@ def find_inspections(keep_text: bool = True) -> dict:
6972
inspections[location].append(obj)
7073
else:
7174
inspections[location] = [obj]
72-
7375
logger.debug(pformat(inspections))
7476
return inspections

schemas.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
"scraped_date": datetime.datetime.now(datetime.UTC),
1515
}
1616

17+
inspection_schema: dict = {
18+
"date": "",
19+
"url": "",
20+
"text": "",
21+
}
22+
1723
field_office_schema: dict = {
1824
"address": {
1925
"administrative_area": "",

utils.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# For general helpers, regexes, or shared logic (e.g. phone/address parsing functions).
2-
import copy
32
import logging
43
import os
54
import polars
@@ -96,18 +95,11 @@ def convert_to_dataframe(d: dict) -> polars.DataFrame:
9695
"""internal dict to dataframe"""
9796
flatdata = [_flatdict(f) for f in d.values()]
9897
"""
99-
Field names should find the _longest_ set of keys, not just the first one
100-
to avoid dropping data by accident from some rows (with things like additional inspections)
98+
Ideally we'd look for the longest row to use as our schema,
99+
but dataframes are picky about services missing those extra rows,
100+
so for simpler logic, we'll just use the first row
101101
"""
102-
longest: list = list(flatdata[0].keys())
103-
longest_len: int = len(longest)
104-
for dobj in flatdata[1:]:
105-
keys = list(dobj.keys())
106-
if len(keys) > longest_len:
107-
longest = copy.deepcopy(keys)
108-
longest_len = len(longest)
109-
logger.info("Key list is: %s", longest)
110-
fieldnames = [k for k in longest if k not in flatdata_filtered_keys]
102+
fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
111103
# https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
112104
df = polars.from_dicts(flatdata, schema=fieldnames)
113105
# logger.debug("Dataframe: %s", df)

0 commit comments

Comments (0)