Skip to content

Commit 16a5184

Browse files
committed
start adding additional names/etc
Signed-off-by: John Seekins <john@robot-house.us>
1 parent 7ea12f5 commit 16a5184

5 files changed

Lines changed: 93 additions & 68 deletions

File tree

ice_scrapers/facilities_scraper.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,19 +43,23 @@ def scrape_facilities(facilities_data: dict) -> dict:
4343
for facility in facilities:
4444
facility = special_facilities(facility)
4545
addr = facility["address"]
46-
street, cleaned = repair_street(addr["street"], addr["locality"])
46+
street, cleaned, other_st = repair_street(addr["street"], addr["locality"])
47+
addr["other_streets"].extend(other_st)
4748
if cleaned:
4849
addr["street"] = street
4950
facility["_repaired_record"] = True
50-
zcode, cleaned = repair_zip(addr["postal_code"], addr["locality"])
51+
zcode, cleaned, other_zip = repair_zip(addr["postal_code"], addr["locality"])
52+
addr["other_postal_codes"].extend(other_zip)
5153
if cleaned:
5254
addr["postal_code"] = zcode
5355
facility["_repaired_record"] = True
54-
locality, cleaned = repair_locality(addr["locality"], addr["administrative_area"])
56+
locality, cleaned, other_city = repair_locality(addr["locality"], addr["administrative_area"])
57+
addr["other_localities"].extend(other_city)
5558
if cleaned:
5659
addr["locality"] = locality
5760
facility["_repaired_record"] = True
58-
name, cleaned = repair_name(facility["name"], addr["locality"])
61+
name, cleaned, other_name = repair_name(facility["name"], addr["locality"])
62+
facility["other_names"].extend(other_name)
5963
if cleaned:
6064
facility["name"] = name
6165
facility["_repaired_record"] = True

ice_scrapers/spreadsheet_load.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -111,20 +111,26 @@ def load_sheet(keep_sheet: bool = True, force_download: bool = True) -> dict:
111111
phone_re = re.compile(r".+(\d{3}\s\d{3}\s\d{4})$")
112112
for row in df.iter_rows(named=True):
113113
details = copy.deepcopy(facility_schema)
114-
zcode, cleaned = repair_zip(row["Zip"], row["City"])
114+
zcode, cleaned, other_zips = repair_zip(row["Zip"], row["City"])
115+
details["address"]["other_postal_codes"].extend(other_zips)
115116
if cleaned:
116117
details["_repaired_record"] = True
117-
street, cleaned = repair_street(row["Address"], row["City"])
118+
street, cleaned, other_st = repair_street(row["Address"], row["City"])
119+
details["address"]["other_streets"].extend(other_st)
118120
if cleaned:
119121
details["_repaired_record"] = True
120122
match = phone_re.search(row["Address"])
121123
if match:
124+
if details.get("phone", None):
125+
details["other_phones"].append(details["phone"])
122126
details["phone"] = match.group(1)
123127
details["_repaired_record"] = True
124-
locality, cleaned = repair_locality(row["City"], row["State"])
128+
locality, cleaned, other_city = repair_locality(row["City"], row["State"])
129+
details["address"]["other_localities"].extend(other_city)
125130
if cleaned:
126131
details["_repaired_record"] = True
127-
name, cleaned = repair_name(row["Name"], row["City"])
132+
name, cleaned, other_names = repair_name(row["Name"], row["City"])
133+
details["other_names"].extend(other_names)
128134
if cleaned:
129135
details["_repaired_record"] = True
130136
details["address"]["administrative_area"] = row["State"]

ice_scrapers/utils.py

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -49,12 +49,13 @@ def special_facilities(facility: dict) -> dict:
4949
facility["address"]["country"] = "Cuba"
5050
facility["address"]["administrative_area"] = "FPO"
5151
facility["name"] = "Naval Station Guantanamo Bay (JTF Camp Six and Migrant Ops Center Main A)"
52+
facility["other_names"] = ["JTF CAMP SIX"]
5253
case _:
5354
pass
5455
return facility
5556

5657

57-
def repair_name(name: str, locality: str) -> tuple[str, bool]:
58+
def repair_name(name: str, locality: str) -> tuple[str, bool, list[str]]:
5859
"""Even facility names are occasionally bad"""
5960
matches = [
6061
{"match": "ALEXANDRIA STAGING FACILI", "replace": "Alexandria Staging Facility", "locality": "ALEXANDRIA"},
@@ -107,15 +108,17 @@ def repair_name(name: str, locality: str) -> tuple[str, bool]:
107108
},
108109
]
109110
cleaned = False
111+
other_names = []
110112
for m in matches:
111113
if m["match"] == name and m["locality"] == locality:
114+
other_names = [m["match"]]
112115
name = m["replace"]
113116
cleaned = True
114117
break
115-
return name, cleaned
118+
return name, cleaned, other_names
116119

117120

118-
def repair_street(street: str, locality: str = "") -> tuple[str, bool]:
121+
def repair_street(street: str, locality: str = "") -> tuple[str, bool, list[str]]:
119122
"""Generally, we'll let the spreadsheet win arguments just to be consistent"""
120123
street_filters = [
121124
# address mismatch between site and spreadsheet
@@ -218,8 +221,10 @@ def repair_street(street: str, locality: str = "") -> tuple[str, bool]:
218221
# default matches should come last
219222
]
220223
cleaned = False
224+
other_streets = []
221225
for f in street_filters:
222226
if (f["match"] in street) and ((f["locality"] and f["locality"] == locality) or not f["locality"]):
227+
other_streets = [f["match"]]
223228
street = street.replace(f["match"], f["replace"])
224229
cleaned = True
225230
break
@@ -233,22 +238,24 @@ def repair_street(street: str, locality: str = "") -> tuple[str, bool]:
233238
if f["match"] in street:
234239
street = street.replace(f["match"], f["replace"])
235240
cleaned = True
236-
return street, cleaned
241+
return street, cleaned, other_streets
237242

238243

239-
def repair_zip(zip_code: int, locality: str) -> tuple[str, bool]:
244+
def repair_zip(zip_code: int, locality: str) -> tuple[str, bool, list[str]]:
240245
"""
241246
Excel does a cool thing where it strips leading 0s
242247
Also, many zip codes are mysteriously discordant
243248
"""
249+
other_zips = []
244250
zcode = str(zip_code)
245251
cleaned = False
246252
# don't replace an empty zip with all 0s
247253
if 0 < len(zcode) < 5:
254+
other_zips = [zcode]
248255
# pad any prefix
249256
zeros = "0" * (5 - len(zcode))
250257
zcode = f"{zeros}{zcode}"
251-
return zcode, cleaned
258+
return zcode, cleaned, other_zips
252259
matches = [
253260
{"match": "89512", "replace": "89506", "locality": "Reno"},
254261
{"match": "82901", "replace": "82935", "locality": "Rock Springs"},
@@ -261,18 +268,20 @@ def repair_zip(zip_code: int, locality: str) -> tuple[str, bool]:
261268
]
262269
for z in matches:
263270
if z["match"] == zcode and z["locality"] == locality:
271+
other_zips = [z["match"]]
264272
zcode = z["replace"]
265273
cleaned = True
266274
break
267-
return zcode, cleaned
275+
return zcode, cleaned, other_zips
268276

269277

270-
def repair_locality(locality: str, administrative_area: str) -> tuple[str, bool]:
278+
def repair_locality(locality: str, administrative_area: str) -> tuple[str, bool, list[str]]:
271279
"""
272280
There is no consistency with any address.
273281
How the post office ever successfully delivered a letter is beyond me
274282
"""
275283
cleaned = False
284+
other_city = []
276285
matches = [
277286
{"match": "LaGrange", "replace": "La Grange", "area": "KY"},
278287
{"match": "Leachfield", "replace": "LEITCHFIELD", "area": "KY"},
@@ -282,10 +291,11 @@ def repair_locality(locality: str, administrative_area: str) -> tuple[str, bool]
282291
]
283292
for f in matches:
284293
if f["match"] == locality and f["area"] == administrative_area:
294+
other_city = [f["match"]]
285295
locality = f["replace"]
286296
cleaned = True
287297
break
288-
return locality, cleaned
298+
return locality, cleaned, other_city
289299

290300

291301
def update_facility(old: dict, new: dict) -> dict:

schemas.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,9 @@
3838
"administrative_area": "",
3939
"country": "",
4040
"locality": "",
41+
"other_localities": [],
42+
"other_postal_codes": [],
43+
"other_streets": [],
4144
"postal_code": "",
4245
"street": "",
4346
},
@@ -62,6 +65,8 @@
6265
"url": "",
6366
},
6467
"name": "",
68+
"other_names": [],
69+
"other_phones": [],
6570
"page_updated_date": None,
6671
"phone": "",
6772
"population": {

0 commit comments

Comments
 (0)