Skip to content

Commit b7c1ae4

Browse files
committed
stringify list columns and add zstandard #97
1 parent dbca23a commit b7c1ae4

5 files changed

Lines changed: 59 additions & 3 deletions

File tree

.config/mise.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ actionlint = "latest"
77
shellcheck = "latest"
88
markdownlint-cli2 = "latest"
99
jq = "latest"
10+
uv = "latest"

file_utils.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import copy
22
import json
33
import os
4+
import polars as pl
45
from schemas import enrichment_print_schema
56
from utils import (
67
convert_to_dataframe,
@@ -9,6 +10,15 @@
910
)
1011
import xlsxwriter # type: ignore [import-untyped]
1112

13+
# CSV and Excel cannot represent list-typed columns, so they are serialized to JSON strings before export.
14+
def _stringify_list_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Convert every List-typed column to a column of JSON strings.

    CSV and Excel writers cannot store nested list values, so each list cell
    is replaced by its JSON array representation (e.g. ``[1, 2]`` becomes
    ``"[1, 2]"``). Columns of any other dtype are left untouched.

    Args:
        df: The frame about to be exported.

    Returns:
        A frame with the same columns and order, where List columns have
        dtype ``pl.String``. If there are no List columns, ``df`` itself is
        returned unchanged.
    """

    def _to_json(val: object) -> str:
        # Polars hands each element of a List column to the UDF as a
        # ``pl.Series``. json.dumps cannot encode a Series and would fall back
        # to ``default=str``, emitting the Series repr instead of a JSON array,
        # so unwrap it into plain Python lists first.
        if isinstance(val, pl.Series):
            val = val.to_list()
        # default=str covers leaf values json cannot encode natively
        # (dates, datetimes, decimals, ...).
        return json.dumps(val, default=str)

    list_cols = [
        col
        for col, dtype in zip(df.columns, df.dtypes)
        if dtype.base_type() == pl.List
    ]
    if not list_cols:
        return df

    return df.with_columns(
        [
            pl.col(c).map_elements(_to_json, return_dtype=pl.String).alias(c)
            for c in list_cols
        ]
    )
1222

1323
def export_to_file(
1424
facilities_data: dict,
@@ -24,10 +34,12 @@ def export_to_file(
2434
match file_type:
2535
case "xlsx":
2636
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
27-
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
37+
_ = _stringify_list_columns(writer).write_excel(workbook=wb, include_header=True, autofit=True)
38+
# _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
2839
case "csv":
2940
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
30-
writer.write_csv(file=f_out, include_header=True)
41+
# writer.write_csv(file=f_out, include_header=True)
42+
_stringify_list_columns(writer).write_csv(file=f_out, include_header=True)
3143
case "parquet":
3244
writer.write_parquet(full_name, use_pyarrow=True)
3345
case _:

ice_scrapers/inspections.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from bs4 import BeautifulSoup
2-
from compression import zstd
2+
import zstandard as zstd
33
import os
44
import pdfplumber
55
from pprint import pformat

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies = [
1414
"requests>=2.32.5",
1515
"thefuzz>=0.22.1",
1616
"xlsxwriter>=3.2.5",
17+
"zstandard>=0.25.0",
1718
]
1819

1920
[dependency-groups]

uv.lock

Lines changed: 42 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)