start fixing flatdict function

johnseekins · johnseekins · commit b60e6eac5a0f · 2026-02-16T22:10:44.000-07:00
Signed-off-by: John Seekins <john@robot-house.us>
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -5,6 +5,11 @@ updates:
     directory: "/"
     schedule:
       interval: "weekly"
+    groups:
+      prod-dependencies:
+        dependency-type: "production"
+      dev-dependencies:
+        dependency-type: "development"
 
   - package-ecosystem: "github-actions"
     directory: "/"
diff --git a/enrichers/general.py b/enrichers/general.py
@@ -59,7 +59,7 @@ def _enrich_facility(facility_data: tuple) -> tuple:
     if lat:
         enriched_facility["osm"]["latitude"] = lat
     if long:
-        enriched_facility["osm"]["longitude"] = lat
+        enriched_facility["osm"]["longitude"] = long
     url = osm_res.get("url", None)
     if url:
         enriched_facility["osm"]["url"] = url
diff --git a/enrichers/openstreetmap.py b/enrichers/openstreetmap.py
@@ -74,7 +74,7 @@ def search(self) -> dict:
         osm_type = first_result.get("osm_type", "")
         osm_id = first_result.get("osm_id", "")
         self.resp_info["title"] = first_result.get("display_name", "")
-        self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
+        self.resp_info["details"] = {"latitude": lat, "longitude": lon, "class": first_result.get("class", "")}
         if osm_type == "way":
             self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
         else:
diff --git a/file_utils.py b/file_utils.py
@@ -1,7 +1,6 @@
 import copy
 import json
 import os
-import polars as pl
 from schemas import enrichment_print_schema
 from utils import (
     convert_to_dataframe,
@@ -10,15 +9,6 @@
 )
 import xlsxwriter  # type: ignore [import-untyped]
 
-# Deals with list columns data that CSV cannot deal with.
-def _stringify_list_columns(df: pl.DataFrame) -> pl.DataFrame:
-    """Convert any List-type columns to JSON strings so CSV/Excel can handle them."""
-    list_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype.base_type() == pl.List]
-    if list_cols:
-        df = df.with_columns(
-            [pl.col(c).map_elements(lambda val: json.dumps(val, default=str), return_dtype=pl.String).alias(c) for c in list_cols]
-        )
-    return df
 
 def export_to_file(
     facilities_data: dict,
@@ -33,13 +23,15 @@ def export_to_file(
         writer = convert_to_dataframe(facilities_data["facilities"])
         match file_type:
             case "xlsx":
+                """
+                objects end up with dates without timezones, so trying to guess it can
+                mess up the translation to XLSX.
+                """
                 with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
-                    _ = _stringify_list_columns(writer).write_excel(workbook=wb, include_header=True, autofit=True)
-                    # _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
+                    _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
             case "csv":
                 with open(full_name, "w", newline="", encoding="utf-8") as f_out:
-                    # writer.write_csv(file=f_out, include_header=True)
-                    _stringify_list_columns(writer).write_csv(file=f_out, include_header=True)
+                    _ = writer.write_csv(file=f_out, include_header=True)
             case "parquet":
                 writer.write_parquet(full_name, use_pyarrow=True)
             case _:
diff --git a/ice_scrapers/spreadsheet_load.py b/ice_scrapers/spreadsheet_load.py
@@ -56,7 +56,6 @@
     "Guaranteed Minimum",
     "Last Inspection Type",
     "Last Inspection End Date",
-    # "Pending YEAR Inspection",
     "Last Inspection Standard",
     "Last Final Rating",
 ]
@@ -126,6 +125,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
         if not all(row[k] is not None for k in required_cols):
             logger.debug("Skipping bad row in spreadsheet: %s", row)
             continue
+        if row["Name"] == "Name":
+            logger.debug("Skipping bad header row: %s", row)
+            continue
         # logger.debug("processing %s", row)
         details = copy.deepcopy(facility_schema)
         zcode, cleaned, other_zips = repair_zip(row["Zip"], row["City"])
diff --git a/utils.py b/utils.py
@@ -32,12 +32,9 @@
 
 # all values that will only complicate workbook output types
 flatdata_filtered_keys = [
-    "_repaired_record",
     "address_str",
     "field_office.address_str",
-    "field_office.source_urls",
     "osm.search_query",
-    "source_urls",
     "wikipedia.search_query",
     "wikidata.search_query",
 ]
@@ -73,13 +70,32 @@ def req_get(url: str, **kwargs) -> requests.Response:
     return response
 
 
-def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
-    """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
+def _flatdict(d: dict, parent_key: str = "", sep: str = ".", list_sep: str = ",") -> dict:
+    """Flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.).
+
+    Nested dict keys are joined with ``sep``. List values are flattened per
+    element: dict elements are expanded under ``<key><sep><index>``, and all
+    other elements are stringified and joined with ``list_sep`` under ``<key>``.
+    An empty list becomes an empty string.
+    """
     items: list = []
     for k, v in d.items():
         new_key = f"{parent_key}{sep}{str(k)}" if parent_key else str(k)
         if isinstance(v, dict):
-            items.extend(_flatdict(v, new_key, sep=sep).items())
+            items.extend(_flatdict(v, new_key, sep=sep, list_sep=list_sep).items())
+        elif isinstance(v, list):
+            if any(isinstance(value, dict) for value in v):
+                logger.debug("List of dicts: %s", v)
+            scalars = []
+            for idx, value in enumerate(v):
+                if isinstance(value, dict):
+                    items.extend(_flatdict(value, f"{new_key}{sep}{idx}", sep=sep, list_sep=list_sep).items())
+                else:
+                    # str() so ints, None, dates, etc. cannot make join() raise TypeError
+                    scalars.append(str(value))
+            # an all-dict list has no scalar column; an empty list keeps its "" cell
+            if scalars or not v:
+                items.append((new_key, list_sep.join(scalars)))
         else:
             items.append((new_key, v))
     return dict(items)