
Commit 3f995ef

fix conflicts and matching for JTF again
Signed-off-by: John Seekins <john@robot-house.us>
2 parents 23886dc + f1e6818

6 files changed: 78 additions & 32 deletions


ice_scrapers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -137,6 +137,7 @@
     repair_name,  # noqa: F401
     repair_street,  # noqa: F401
     repair_zip,  # noqa: F401
+    special_facilities,  # noqa: F401
     update_facility,  # noqa: F401
 )
 from .facilities_scraper import scrape_facilities  # noqa: F401,E402

ice_scrapers/facilities_scraper.py

Lines changed: 5 additions & 2 deletions
@@ -7,6 +7,7 @@
     repair_locality,
     repair_street,
     repair_zip,
+    special_facilities,
     update_facility,
 )
 from schemas import facility_schema
@@ -27,7 +28,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
     logger.info("Starting to scrape ICE.gov detention facilities...")
     facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
     urls = get_ice_scrape_pages(base_scrape_url)
-
+    scraped_count = 0
     for page_num, url in enumerate(urls):
         logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
         try:
@@ -36,7 +37,9 @@ def scrape_facilities(facilities_data: dict) -> dict:
             logger.error("Error scraping page %s: %s", page_num + 1, e)
         logger.debug("Found %s facilities on page %s", len(facilities), page_num + 1)
         time.sleep(1)  # Be respectful to the server
+        scraped_count += len(facilities)
         for facility in facilities:
+            facility = special_facilities(facility)
             addr = facility["address"]
             street, cleaned = repair_street(addr["street"], addr["locality"])
             if cleaned:
@@ -75,7 +78,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
             facilities_data["facilities"][facility["name"]] = facility  # type: ignore [index]
 
     facilities_data["scrape_runtime"] = time.time() - start_time
-    logger.info("Total facilities scraped: %s", len(list(facilities_data["facilities"].keys())))  # type: ignore [attr-defined]
+    logger.info("Total facilities scraped: %s", scraped_count)
     logger.info(" Completed in %s seconds", facilities_data["scrape_runtime"])
     return facilities_data
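Note the changed semantics in the final log line: it now reports the number of facility rows seen across all pages rather than the number of unique keys in the dict, so duplicate records that converge onto one entry are still counted. A toy sketch of the difference (page contents are hypothetical):

    pages = [["A", "B"], ["B", "C"]]  # hypothetical facility names per page
    scraped_count = sum(len(page) for page in pages)  # 4: every scraped row counts
    unique_count = len({name for page in pages for name in page})  # 3: dict-key count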

ice_scrapers/spreadsheet_load.py

Lines changed: 10 additions & 1 deletion
@@ -8,6 +8,7 @@
     repair_name,
     repair_street,
     repair_zip,
+    special_facilities,
 )
 import os
 import polars
@@ -131,12 +132,20 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
         name, cleaned = repair_name(row["Name"], row["City"])
         if cleaned:
             details["_repaired_record"] = True
-        full_address = ",".join([street, locality, row["State"], zcode]).upper()
         details["address"]["administrative_area"] = row["State"]
         details["address"]["locality"] = locality
         details["address"]["postal_code"] = zcode
         details["address"]["street"] = street
         details["name"] = name
+        details = special_facilities(details)
+        full_address = ",".join(
+            [
+                details["address"]["street"],
+                details["address"]["locality"],
+                details["address"]["administrative_area"],
+                details["address"]["postal_code"],
+            ]
+        ).upper()
 
         """
         population statistics
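The reordering matters: full_address is now built after special_facilities runs, so it reflects the post-repair fields instead of the raw spreadsheet columns. A sketch with a hypothetical repaired record:

    details = {"address": {"street": "AVENUE C PSC 1005 BOX 55", "locality": "FPO",
                           "administrative_area": "FPO", "postal_code": "34009"}}
    full_address = ",".join(
        [
            details["address"]["street"],
            details["address"]["locality"],
            details["address"]["administrative_area"],
            details["address"]["postal_code"],
        ]
    ).upper()
    # -> "AVENUE C PSC 1005 BOX 55,FPO,FPO,34009"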

ice_scrapers/utils.py

Lines changed: 28 additions & 0 deletions
@@ -7,6 +7,34 @@
 )
 
 
+def special_facilities(facility: dict) -> dict:
+    """
+    Some very specific facilities have unique fixes
+    that are hard to fit into our normal repair_* pattern.
+
+    Please don't expand this function unless it's necessary
+    """
+    match facility["name"]:
+        case "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)":
+            """
+            First special case? JTF Camp Six is purely a mess.
+            While we work on getting a consistent address for this facility,
+            we'll need to make the two records converge.
+            """
+            facility["address"]["country"] = "Cuba"
+            facility["address"]["administrative_area"] = "FPO"
+            facility["address"]["locality"] = "FPO"
+            facility["address"]["postal_code"] = "34009"
+            facility["address"]["street"] = "AVENUE C PSC 1005 BOX 55"
+        case "JTF CAMP SIX":
+            facility["address"]["country"] = "Cuba"
+            facility["address"]["administrative_area"] = "FPO"
+            facility["name"] = "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)"
+        case _:
+            pass
+    return facility
+
+
 def repair_street(street: str, locality: str = "") -> Tuple[str, bool]:
     """Generally, we'll let the spreadsheet win arguments just to be consistent"""
     street_filters = [
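A quick illustration of how the new helper converges the two JTF records (the record shape is inferred from the fields this diff touches):

    record = {
        "name": "JTF CAMP SIX",
        "address": {"street": "", "locality": "", "administrative_area": "",
                    "postal_code": "", "country": ""},
    }
    record = special_facilities(record)
    # Both sources now key to the same canonical facility:
    assert record["name"] == "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)"
    assert record["address"]["administrative_area"] == "FPO"
    assert record["address"]["country"] == "Cuba"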

ice_scrapers/vera_data.py

Lines changed: 11 additions & 6 deletions
@@ -240,15 +240,21 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f
             continue
         found = False
         facility_name, fixed_name = _vera_name_fixes(row["detention_facility_name"], row["city"])
+        row["name"] = facility_name
         city, fixed_city = _vera_city_fixes(row["city"], row["state"])
+        row["city"] = city
         if fixed_name or fixed_city:
             fixed += 1
-        addr_str = f"{facility_name},{city},{row['state']},United States"
+        if row["name"] == "JTF Camp Six":
+            row["state"] = "FPO"
+            row["city"] = "FPO"
+            row["name"] = "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)"
+        addr_str = f"{row['name']},{row['city']},{row['state']}"
         for k, v in facilities_data["facilities"].items():
             if (
-                v["name"].upper() == facility_name.upper()
+                v["name"].upper() == row["name"].upper()
                 and v["address"]["administrative_area"].upper() == row["state"].upper()
-                and v["address"]["locality"].upper() == city.upper()
+                and v["address"]["locality"].upper() == row["city"].upper()
             ):
                 logger.debug(" Found matching facility %s...", v["name"])
                 facilities_data["facilities"][k]["osm"]["latitude"] = row["latitude"]
@@ -261,10 +267,9 @@ def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, f
         if not found:
             facilities_data["facilities"][addr_str] = copy.deepcopy(facility_schema)
             facilities_data["facilities"][addr_str]["source_urls"].append(base_url)
-            facilities_data["facilities"][addr_str]["name"] = facility_name
+            facilities_data["facilities"][addr_str]["name"] = row["name"]
             facilities_data["facilities"][addr_str]["address"]["administrative_area"] = row["state"]
-            facilities_data["facilities"][addr_str]["address"]["locality"] = city
-            facilities_data["facilities"][addr_str]["address"]["country"] = "United States"
+            facilities_data["facilities"][addr_str]["address"]["locality"] = row["city"]
             facilities_data["facilities"][addr_str]["address_str"] = addr_str
             facilities_data["facilities"][addr_str]["osm"]["latitude"] = row["latitude"]
             facilities_data["facilities"][addr_str]["osm"]["longitude"] = row["longitude"]
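After this change the match reduces to a case-insensitive comparison on the normalized row fields, which is why the JTF rename above must happen before the loop. A standalone sketch of the predicate (dict shapes assumed from the diff):

    def matches(facility: dict, row: dict) -> bool:
        """Case-insensitive match on name, state, and city, as in the updated loop."""
        return (
            facility["name"].upper() == row["name"].upper()
            and facility["address"]["administrative_area"].upper() == row["state"].upper()
            and facility["address"]["locality"].upper() == row["city"].upper()
        )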
