Skip to content

Commit bf060df

Browse files
committed
correct usage of ruff in CI and reverse inspections order
Signed-off-by: John Seekins <john@robot-house.us>
1 parent f786ad3 commit bf060df

5 files changed

Lines changed: 24 additions & 19 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,18 @@ jobs:
1515
run: |
1616
curl -SsL https://mise.run | bash > /dev/null
1717
~/.local/bin/mise trust --quiet .config/mise.toml
18-
~/.local/bin/mise install --quiet python
18+
~/.local/bin/mise install --quiet ruff python
1919
eval "$(~/.local/bin/mise activate bash)" > /dev/null
2020
pip install -q --no-cache-dir --upgrade pip wheel uv
2121
uv sync
2222
- name: Ruff Format
2323
run: |
2424
eval "$(~/.local/bin/mise activate bash)" > /dev/null
25-
uv run ruff format --check
25+
ruff format --check
2626
- name: Ruff Check
2727
run: |
2828
eval "$(~/.local/bin/mise activate bash)" > /dev/null
29-
uv run ruff check
29+
ruff check
3030
- name: run mypy
3131
run: |
3232
eval "$(~/.local/bin/mise activate bash)" > /dev/null

ice_scrapers/general.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ def facilities_scrape_wrapper(
3333
logger.debug(" Matching %s for inspection details...", facility)
3434
# exact match (extremely unlikely)
3535
if facility.lower() in facility_name_map:
36+
"""
37+
flip the order so the newest inspection is likely first in the list
38+
because trying to convert these wildly inconsistent dates to sortable
39+
objects is probably a fool's errand, so we'll just hope for the best...
40+
"""
3641
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
3742
inspect
38-
)
43+
).reverse()
3944
break
4045
# logger.debug(" Checking fuzzy matches:")
4146
for k, v in facility_name_map.items():

ice_scrapers/inspections.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from bs4 import BeautifulSoup
2-
import zstandard as zstd
2+
import copy
33
import os
44
import pdfplumber
55
from pprint import pformat
66
import re
7+
from schemas import inspection_schema
78
import sys
89
from utils import (
910
logger,
1011
output_folder,
1112
req_get,
1213
)
14+
import zstandard as zstd
15+
1316
from .utils import download_file
1417

1518
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
@@ -45,7 +48,7 @@ def find_inspections(keep_text: bool = True) -> dict:
4548
links = content.select("a") # type: ignore [union-attr]
4649
for link in links:
4750
url = link["href"]
48-
obj = {"date": "", "url": url, "text": ""}
51+
obj = copy.deepcopy(inspection_schema)
4952
matches = text_re.search(link.text.strip())
5053
if len(matches.groups()) < 5: # type: ignore [union-attr]
5154
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
@@ -69,6 +72,5 @@ def find_inspections(keep_text: bool = True) -> dict:
6972
inspections[location].append(obj)
7073
else:
7174
inspections[location] = [obj]
72-
7375
logger.debug(pformat(inspections))
7476
return inspections

schemas.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
"scraped_date": datetime.datetime.now(datetime.UTC),
1515
}
1616

17+
inspection_schema: dict = {
18+
"date": "",
19+
"url": "",
20+
"text": "",
21+
}
22+
1723
field_office_schema: dict = {
1824
"address": {
1925
"administrative_area": "",

utils.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# For general helpers, regexes, or shared logic (e.g. phone/address parsing functions).
2-
import copy
32
import logging
43
import os
54
import polars
@@ -96,18 +95,11 @@ def convert_to_dataframe(d: dict) -> polars.DataFrame:
9695
"""internal dict to dataframe"""
9796
flatdata = [_flatdict(f) for f in d.values()]
9897
"""
99-
Field names should find the _longest_ set of keys, not just the first one
100-
to avoid dropping data by accident from some rows (with things like additional inspections)
98+
Ideally we'd look for the longest row to use as our schema,
99+
but dataframes are picky about services missing those extra rows,
100+
so for simpler logic, we'll just use the first row
101101
"""
102-
longest: list = list(flatdata[0].keys())
103-
longest_len: int = len(longest)
104-
for dobj in flatdata[1:]:
105-
keys = list(dobj.keys())
106-
if len(keys) > longest_len:
107-
longest = copy.deepcopy(keys)
108-
longest_len = len(longest)
109-
logger.info("Key list is: %s", longest)
110-
fieldnames = [k for k in longest if k not in flatdata_filtered_keys]
102+
fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
111103
# https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
112104
df = polars.from_dicts(flatdata, schema=fieldnames)
113105
# logger.debug("Dataframe: %s", df)

0 commit comments

Comments (0)