Skip to content

Commit 04f03bd

Browse files
authored
Merge pull request #103 from johnseekins/fix-flatten-and-dep-tracking
Fix flatten, dep tracking, and other small bugs
2 parents 8821a06 + 2e78ed5 commit 04f03bd

13 files changed

Lines changed: 67 additions & 73 deletions

File tree

.config/mise.toml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
[tools]
2-
python = "3.14.1"
2+
# languages
33
node = "latest"
4-
lefthook = "latest"
5-
yamllint = "latest"
4+
python = "3.14.1"
5+
# linters and tooling
66
actionlint = "latest"
7-
shellcheck = "latest"
8-
markdownlint-cli2 = "latest"
97
jq = "latest"
10-
uv = "latest"
8+
lefthook = "latest"
9+
markdownlint-cli2 = "latest"
10+
ruff = "latest"
11+
shellcheck = "latest"
12+
yamllint = "latest"

.github/dependabot.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ updates:
55
directory: "/"
66
schedule:
77
interval: "weekly"
8+
groups:
9+
prod-dependencies:
10+
dependency-type: "production"
11+
dev-dependencies:
12+
dependency-type: "development"
813

914
- package-ecosystem: "github-actions"
1015
directory: "/"

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@ jobs:
1515
run: |
1616
curl -SsL https://mise.run | bash > /dev/null
1717
~/.local/bin/mise trust --quiet .config/mise.toml
18-
~/.local/bin/mise install --quiet python
18+
~/.local/bin/mise install --quiet ruff python
1919
eval "$(~/.local/bin/mise activate bash)" > /dev/null
2020
pip install -q --no-cache-dir --upgrade pip wheel uv
2121
uv sync
2222
- name: Ruff Format
2323
run: |
2424
eval "$(~/.local/bin/mise activate bash)" > /dev/null
25-
uv run ruff format --check
25+
ruff format --check
2626
- name: Ruff Check
2727
run: |
2828
eval "$(~/.local/bin/mise activate bash)" > /dev/null
29-
uv run ruff check
29+
ruff check
3030
- name: run mypy
3131
run: |
3232
eval "$(~/.local/bin/mise activate bash)" > /dev/null

.lefthook.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ pre-commit:
2727
- ".github/workflows/*.y*ml"
2828

2929
- name: Ruff Formatting
30-
run: uv run ruff format -q .
30+
run: ruff format -q .
3131
glob:
3232
- "*.py"
3333
stage_fixed: true
3434

3535
- name: Ruff Syntax checking
36-
run: uv run ruff check --fix -q
36+
run: ruff check --fix -q
3737
glob:
3838
- "*.py"
3939
stage_fixed: true

file_utils.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import copy
22
import json
33
import os
4-
import polars as pl
54
from schemas import enrichment_print_schema
65
from utils import (
76
convert_to_dataframe,
@@ -11,22 +10,6 @@
1110
import xlsxwriter # type: ignore [import-untyped]
1211

1312

14-
# Deals with list columns data that CSV cannot deal with.
15-
def _stringify_list_columns(df: pl.DataFrame) -> pl.DataFrame:
16-
"""Convert any List-type columns to JSON strings so CSV/Excel can handle them."""
17-
list_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype.base_type() == pl.List]
18-
if list_cols:
19-
df = df.with_columns(
20-
[
21-
pl.col(c)
22-
.map_elements(lambda val: json.dumps(val.to_list(), default=str), return_dtype=pl.String)
23-
.alias(c)
24-
for c in list_cols
25-
]
26-
)
27-
return df
28-
29-
3013
def export_to_file(
3114
facilities_data: dict,
3215
filename: str = "ice_detention_facilities_enriched",
@@ -40,13 +23,12 @@ def export_to_file(
4023
writer = convert_to_dataframe(facilities_data["facilities"])
4124
match file_type:
4225
case "xlsx":
26+
# Excel doesn't support timezones properly, so...
4327
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
44-
_ = _stringify_list_columns(writer).write_excel(workbook=wb, include_header=True, autofit=True)
45-
# _ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
28+
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
4629
case "csv":
4730
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
48-
# writer.write_csv(file=f_out, include_header=True)
49-
_stringify_list_columns(writer).write_csv(file=f_out, include_header=True)
31+
_ = writer.write_csv(file=f_out, include_header=True)
5032
case "parquet":
5133
writer.write_parquet(full_name, use_pyarrow=True)
5234
case _:

ice_scrapers/general.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ def facilities_scrape_wrapper(
3333
logger.debug(" Matching %s for inspection details...", facility)
3434
# exact match (extremely unlikely)
3535
if facility.lower() in facility_name_map:
36+
"""
37+
flip the order so the newest inspection is likely first in the list
38+
because trying to convert these wildly inconsistent dates to sortable
39+
objects is probably a fool's errand, so we'll just hope for the best...
40+
"""
3641
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
3742
inspect
38-
)
43+
).reverse()
3944
break
4045
# logger.debug(" Checking fuzzy matches:")
4146
for k, v in facility_name_map.items():

ice_scrapers/inspections.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from bs4 import BeautifulSoup
2-
import zstandard as zstd
2+
import copy
33
import os
44
import pdfplumber
55
from pprint import pformat
66
import re
7+
from schemas import inspection_schema
78
import sys
89
from utils import (
910
logger,
1011
output_folder,
1112
req_get,
1213
)
14+
import zstandard as zstd
15+
1316
from .utils import download_file
1417

1518
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
@@ -45,7 +48,7 @@ def find_inspections(keep_text: bool = True) -> dict:
4548
links = content.select("a") # type: ignore [union-attr]
4649
for link in links:
4750
url = link["href"]
48-
obj = {"date": "", "url": url, "text": ""}
51+
obj = copy.deepcopy(inspection_schema)
4952
matches = text_re.search(link.text.strip())
5053
if len(matches.groups()) < 5: # type: ignore [union-attr]
5154
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
@@ -69,6 +72,5 @@ def find_inspections(keep_text: bool = True) -> dict:
6972
inspections[location].append(obj)
7073
else:
7174
inspections[location] = [obj]
72-
7375
logger.debug(pformat(inspections))
7476
return inspections

ice_scrapers/spreadsheet_load.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@
5656
"Guaranteed Minimum",
5757
"Last Inspection Type",
5858
"Last Inspection End Date",
59-
# "Pending YEAR Inspection",
6059
"Last Inspection Standard",
6160
"Last Final Rating",
6261
]
@@ -126,6 +125,9 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
126125
if not all(row[k] is not None for k in required_cols):
127126
logger.debug("Skipping bad row in spreadsheet: %s", row)
128127
continue
128+
if row["Name"] == "Name":
129+
logger.debug("Skipping bad header row: %s", row)
130+
continue
129131
# logger.debug("processing %s", row)
130132
details = copy.deepcopy(facility_schema)
131133
zcode, cleaned, other_zips = repair_zip(row["Zip"], row["City"])

main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def main() -> None:
5656
"--file-type",
5757
choices=supported_output_types,
5858
type=str,
59+
default="csv",
5960
help="type of file to export",
6061
)
6162
_ = parser.add_argument(

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ dependencies = [
2020
[dependency-groups]
2121
dev = [
2222
"mypy>=1.17.1",
23-
"ruff>=0.12.12",
2423
"types-beautifulsoup4>=4.12.0.20250516",
2524
"types-requests>=2.32.4.20250809",
2625
]

0 commit comments

Comments (0)