Skip to content

Commit b60e6ea

Browse files
committed
start fixing flatdict function
Signed-off-by: John Seekins <john@robot-house.us>
1 parent b7c1ae4 commit b60e6ea

6 files changed

Lines changed: 28 additions & 22 deletions

File tree

.github/dependabot.yml

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,11 @@ updates:
55
directory: "/"
66
schedule:
77
interval: "weekly"
8+
groups:
9+
prod-dependencies:
10+
dependency-type: "production"
11+
dev-dependencies:
12+
dependency-type: "development"
813

914
- package-ecosystem: "github-actions"
1015
directory: "/"

enrichers/general.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ def _enrich_facility(facility_data: tuple) -> tuple:
5959
if lat:
6060
enriched_facility["osm"]["latitude"] = lat
6161
if long:
62-
enriched_facility["osm"]["longitude"] = lat
62+
enriched_facility["osm"]["longitude"] = long
6363
url = osm_res.get("url", None)
6464
if url:
6565
enriched_facility["osm"]["url"] = url

enrichers/openstreetmap.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ def search(self) -> dict:
7474
osm_type = first_result.get("osm_type", "")
7575
osm_id = first_result.get("osm_id", "")
7676
self.resp_info["title"] = first_result.get("display_name", "")
77-
self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
77+
self.resp_info["details"] = {"latitude": lat, "longitude": lon, "class": first_result.get("class", "")}
7878
if osm_type == "way":
7979
self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
8080
else:

file_utils.py

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
import copy
22
import json
33
import os
4-
import polars as pl
54
from schemas import enrichment_print_schema
65
from utils import (
76
convert_to_dataframe,
@@ -10,15 +9,6 @@
109
)
1110
import xlsxwriter # type: ignore [import-untyped]
1211

13-
# Deals with list columns data that CSV cannot deal with.
14-
def _stringify_list_columns(df: pl.DataFrame) -> pl.DataFrame:
15-
"""Convert any List-type columns to JSON strings so CSV/Excel can handle them."""
16-
list_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype.base_type() == pl.List]
17-
if list_cols:
18-
df = df.with_columns(
19-
[pl.col(c).map_elements(lambda val: json.dumps(val, default=str), return_dtype=pl.String).alias(c) for c in list_cols]
20-
)
21-
return df
2212

2313
def export_to_file(
2414
facilities_data: dict,
@@ -33,13 +23,15 @@ def export_to_file(
3323
writer = convert_to_dataframe(facilities_data["facilities"])
3424
match file_type:
3525
case "xlsx":
26+
"""
27+
objects end up with dates without timezones, so trying to guess it can
28+
mess up the translation to XLSX.
29+
"""
3630
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
37-
_ = _stringify_list_columns(writer).write_excel(workbook=wb, include_header=True, autofit=True)
38-
# _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
31+
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
3932
case "csv":
4033
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
41-
# writer.write_csv(file=f_out, include_header=True)
42-
_stringify_list_columns(writer).write_csv(file=f_out, include_header=True)
34+
_ = writer.write_csv(file=f_out, include_header=True)
4335
case "parquet":
4436
writer.write_parquet(full_name, use_pyarrow=True)
4537
case _:

ice_scrapers/spreadsheet_load.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,6 @@
5656
"Guaranteed Minimum",
5757
"Last Inspection Type",
5858
"Last Inspection End Date",
59-
# "Pending YEAR Inspection",
6059
"Last Inspection Standard",
6160
"Last Final Rating",
6261
]
@@ -126,6 +125,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
126125
if not all(row[k] is not None for k in required_cols):
127126
logger.debug("Skipping bad row in spreadsheet: %s", row)
128127
continue
128+
if row["Name"] == "Name":
129+
logger.debug("Skipping bad header row: %s", row)
130+
continue
129131
# logger.debug("processing %s", row)
130132
details = copy.deepcopy(facility_schema)
131133
zcode, cleaned, other_zips = repair_zip(row["Zip"], row["City"])

utils.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -32,12 +32,9 @@
3232

3333
# all values that will only complicate workbook output types
3434
flatdata_filtered_keys = [
35-
"_repaired_record",
3635
"address_str",
3736
"field_office.address_str",
38-
"field_office.source_urls",
3937
"osm.search_query",
40-
"source_urls",
4138
"wikipedia.search_query",
4239
"wikidata.search_query",
4340
]
@@ -73,13 +70,23 @@ def req_get(url: str, **kwargs) -> requests.Response:
7370
return response
7471

7572

76-
def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
73+
def _flatdict(d: dict, parent_key: str = "", sep: str = ".", list_sep: str = ",") -> dict:
7774
"""flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
7875
items: list = []
7976
for k, v in d.items():
8077
new_key = f"{parent_key}{sep}{str(k)}" if parent_key else str(k)
8178
if isinstance(v, dict):
82-
items.extend(_flatdict(v, new_key, sep=sep).items())
79+
items.extend(_flatdict(v, new_key, sep=sep, list_sep=list_sep).items())
80+
elif isinstance(v, list):
81+
if not v:
82+
items.append((new_key, ""))
83+
elif isinstance(v[0], dict):
84+
logger.info("List of dicts: %s", v)
85+
for idx, value in enumerate(v):
86+
items.extend(_flatdict(value, f"{new_key}{sep}{idx}", sep=sep, list_sep=list_sep).items())
87+
88+
else:
89+
items.append((new_key, list_sep.join(v)))
8390
else:
8491
items.append((new_key, v))
8592
return dict(items)

0 commit comments

Comments
 (0)