
Commit 8363d6e

committed by John Seekins
wrapper function for requests across whole project
Signed-off-by: John Seekins <john@robot-house.us>
1 parent: 3a1bdbc · commit: 8363d6e

9 files changed

Lines changed: 94 additions & 84 deletions

File tree

enrichers/__init__.py
enrichers/openstreetmap.py
enrichers/wikidata.py
enrichers/wikipedia.py
ice_scrapers/facilities_scraper.py
ice_scrapers/utils.py
main.py
schemas.py
utils.py

enrichers/__init__.py

Lines changed: 0 additions & 30 deletions
@@ -5,21 +5,13 @@
 """
 
 import copy
-import requests
 from schemas import enrich_resp_schema
-import time
-from utils import (
-    default_headers,
-    session,
-)
 
 
 class Enrichment(object):
     _required_keys = [
         "facility_name",
     ]
-    # in seconds
-    _wait_time: float = 1
 
     def __init__(self, **kwargs):
         self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
         """Child objects should implement this"""
         return {}
 
-    def _req(self, url: str, **kwargs) -> requests.Response:
-        """requests response wrapper to ensure we honor waits"""
-        headers = kwargs.get("headers", {})
-        # ensure we get all headers configured correctly
-        # but manually applied headers win the argument
-        for k, v in default_headers.items():
-            if k in headers.keys():
-                continue
-            headers[k] = v
-
-        response = session.get(
-            url,
-            allow_redirects=True,
-            timeout=kwargs.get("timeout", 10),
-            params=kwargs.get("params", {}),
-            stream=kwargs.get("stream", False),
-            headers=headers,
-        )
-        response.raise_for_status()
-        time.sleep(self._wait_time)
-        return response
-
     def _minimal_clean_facility_name(self, name: str) -> str:
         """Minimal cleaning that preserves important context like 'County Jail'"""
         cleaned = name
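
With the per-class _req helper removed, each enricher is expected to call the shared req_get wrapper from utils instead (its definition appears in the utils.py diff below). A minimal sketch of the migrated call pattern, using a hypothetical ExampleEnricher class and URL; the real call sites are in the openstreetmap.py, wikidata.py, and wikipedia.py diffs:

    # hypothetical subclass illustrating the post-commit call pattern
    from enrichers import Enrichment
    from utils import logger, req_get

    class ExampleEnricher(Enrichment):
        # assumed to live on the subclass now that the base-class attribute is removed
        _wait_time: float = 1

        def search(self) -> dict:
            try:
                # req_get merges default_headers, raises on HTTP errors by default,
                # and sleeps wait_time seconds before returning
                response = req_get("https://example.org/api", params={"q": "query"}, wait_time=self._wait_time)
                self.resp_info["title"] = response.json().get("title", "")
            except Exception as e:
                logger.debug("example search error: %s", e)
            return self.resp_info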

enrichers/openstreetmap.py

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,5 @@
 from enrichers import Enrichment
-from utils import logger
+from utils import logger, req_get
 
 
 class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
                 "dedupe": 1,
             },
             "street_address": {
-                "q": f"{full_address}",
+                "q": full_address,
                 "format": "json",
                 "limit": 5,
                 "dedupe": 1,
             },
             "locality": {
-                "q": f"{locality}",
+                "q": locality,
                 "format": "json",
                 "limit": 5,
                 "dedupe": 1,
@@ -56,7 +56,7 @@ def search(self) -> dict:
             logger.debug("Searching OSM for %s", params["q"])
             self.resp_info["search_query_steps"].append(params["q"])  # type: ignore [attr-defined]
             try:
-                response = self._req(search_url, params=params, timeout=15)
+                response = req_get(search_url, params=params, timeout=15)
                 data.extend(response.json())
             except Exception as e:
                 logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@ def search(self) -> dict:
         lon = first_result.get("lon", self.default_coords["longitude"])
         osm_type = first_result.get("osm_type", "")
         osm_id = first_result.get("osm_id", "")
-        self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
-        self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
         self.resp_info["title"] = first_result.get("display_name", "")
-        self.resp_info["details"]["class"] = first_result.get("class", "")  # type: ignore [index]
+        self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
         if osm_type == "way":
             self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
         else:

enrichers/wikidata.py

Lines changed: 30 additions & 23 deletions
@@ -1,5 +1,8 @@
 from enrichers import Enrichment
-from utils import logger
+from utils import (
+    logger,
+    req_get,
+)
 
 
 class Wikidata(Enrichment):
@@ -11,29 +14,32 @@ def search(self) -> dict:
         # Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
         # Falls back to first result (usually truncated, eg. county)
         search_name_fallback = self._clean_facility_name(facility_name)
+        self.resp_info["enrichment_type"] = "wikidata"
         logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
         search_url = "https://www.wikidata.org/w/api.php"
         params = {
-            "action": "wbsearchentities",
-            "search": facility_name,
-            "language": "en",
-            "format": "json",
-            "limit": 3,
+            "facility_name": {
+                "action": "wbsearchentities",
+                "search": facility_name,
+                "language": "en",
+                "format": "json",
+                "limit": 3,
+            },
+            "fallback": {
+                "action": "wbsearchentities",
+                "search": search_name_fallback,
+                "language": "en",
+                "format": "json",
+                "limit": 3,
+            },
         }
-        self.resp_info["enrichment_type"] = "wikidata"
         data = {}
-        try:
-            response = self._req(search_url, params=params)
-            data = response.json()
-        except Exception as e:
-            logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
-            self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
-        if not data.get("search"):
-            params["search"] = search_name_fallback
-            self.resp_info["search_query_steps"].append(search_name_fallback)  # type: ignore [attr-defined]
+        for search, params in params.items():
+            self.resp_info["search_query_steps"].append(params["search"])  # type: ignore [attr-defined]
             try:
-                response = self._req(search_url, params=params)
+                response = req_get(search_url, params=params, wait_time=self._wait_time)
                 data = response.json()
+                break
             except Exception as e:
                 logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
                 self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
@@ -45,10 +51,11 @@ def search(self) -> dict:
             if any(term in description for term in match_terms):
                 self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
                 self.resp_info["title"] = result.get("label", "")
-                return self.resp_info
-        # fallback to first result
-        first = data["search"][0]
-        logger.debug(" Closer matching failed, falling back to first result %s", first)
-        self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
-        self.resp_info["title"] = result.get("label", "")
+                break
+        else:
+            # fall back to first result
+            first = data["search"][0]
+            logger.debug(" Closer matching failed, falling back to first result %s", first)
+            self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
+            self.resp_info["title"] = first.get("label", "")
         return self.resp_info

enrichers/wikipedia.py

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,6 @@
 from enrichers import Enrichment
 from urllib.parse import quote
-from utils import logger
+from utils import logger, req_get
 
 
 class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
         self.resp_info["search_query_steps"].append(wiki_url)  # type: ignore [attr-defined]
         initial_response = False
         try:
-            response = self._req(wiki_url)
+            response = req_get(wiki_url, wait_time=self._wait_time)
             initial_response = True
         except Exception as e:
             logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
             self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
             wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
             self.resp_info["search_query_steps"].append(wiki_url)  # type: ignore [attr-defined]
             try:
-                response = self._req(wiki_url)
+                response = req_get(wiki_url, wait_time=self._wait_time)
                 initial_response = True
             except Exception as e:
                 logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
         }
 
         try:
-            response = self._req(self.api_search, params=params)
+            response = req_get(self.api_search, params=params, wait_time=self._wait_time)
             data = response.json()
         except Exception as e:
             logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:
 
         # Verify the page exists and isn't a redirect to something unrelated
         try:
-            verify_response = self._req(final_url)
+            verify_response = req_get(final_url, wait_time=self._wait_time)
         except Exception as e:
             logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
             self.resp_info["search_query_steps"].append(final_url)  # type: ignore [attr-defined]

ice_scrapers/facilities_scraper.py

Lines changed: 3 additions & 5 deletions
@@ -7,7 +7,7 @@
 from utils import (
     default_timestamp,
     logger,
-    session,
+    req_get,
     timestamp_format,
 )
 from .utils import (
@@ -95,8 +95,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
         return datetime.datetime.strptime(default_timestamp, timestamp_format)
     logger.debug(" Fetching: %s", url)
    try:
-        response = session.get(url, timeout=30)
-        response.raise_for_status()
+        response = req_get(url, timeout=30)
    except Exception as e:
        logger.error(" Error parsing %s: %s", url, e)
        return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +117,7 @@ def _scrape_page(page_url: str) -> list:
    """Scrape a single page of facilities using BeautifulSoup"""
    logger.debug(" Fetching: %s", page_url)
    try:
-        response = session.get(page_url, timeout=30)
-        response.raise_for_status()
+        response = req_get(page_url, timeout=30)
    except Exception as e:
        logger.error(" Error parsing %s: %s", page_url, e)
        return []

ice_scrapers/utils.py

Lines changed: 17 additions & 11 deletions
@@ -3,7 +3,7 @@
 import re
 from utils import (
     logger,
-    session,
+    req_get,
 )
 
 
@@ -13,13 +13,17 @@ def download_file(link: str, path: str, redownload: bool = False) -> None:
     """
     if os.path.exists(path) and os.path.getsize(path) > 0 and not redownload:
         logger.debug(" Skipping redownload of existing file %s", path)
-    resp = session.get(link, timeout=120, stream=True)
-    size = len(resp.content)
-    with open(path, "wb") as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-    logger.debug(" Wrote %s byte file to %s", size, path)
+    try:
+        resp = req_get(link, timeout=120, stream=True)
+    except Exception as e:
+        logger.error("Failed to download %s :: %s", link, e)
+    else:
+        size = len(resp.content)
+        with open(path, "wb") as f:
+            for chunk in resp.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+        logger.debug(" Wrote %s byte file to %s", size, path)
 
 
 def special_facilities(facility: dict) -> dict:
@@ -294,13 +298,15 @@ def update_facility(old: dict, new: dict) -> dict:
     return old
 
 
-def get_ice_scrape_pages(url: str) -> list:
+def get_ice_scrape_pages(url: str) -> list[str]:
     """
     Discover all facility pages
     This _may_ be generic to Drupal's pagination code...
     """
-    resp = session.get(url, timeout=30)
-    resp.raise_for_status()
+    try:
+        resp = req_get(url, timeout=30)
+    except Exception:
+        return []
     soup = BeautifulSoup(resp.content, "html.parser")
     links = soup.findAll("a", href=re.compile(r"\?page="))
     if not links:

main.py

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@
 from enrichers import enrich_facility_data
 from schemas import supported_output_types
 from utils import logger
-# CLI, argument parsing, script orchestration
 
 
 def main() -> None:

schemas.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
         "last_date": None,
         "last_rating": "",
         "last_type": "",
+        "details": [],
     },
     "image_url": "",
     "osm": {

utils.py

Lines changed: 33 additions & 2 deletions
@@ -4,6 +4,7 @@
 import polars
 import requests
 from requests.adapters import HTTPAdapter
+import time
 import urllib3
 
 SCRIPTDIR = os.path.dirname(os.path.realpath(__file__))
@@ -42,6 +43,36 @@
 ]
 
 
+def req_get(url: str, **kwargs) -> requests.Response:
+    """requests response wrapper to ensure we honor waits"""
+    headers = kwargs.get("headers", {})
+    # ensure we get all headers configured correctly
+    # but manually applied headers win the argument
+    for k, v in default_headers.items():
+        if k in headers.keys():
+            continue
+        headers[k] = v
+
+    response = session.get(
+        url,
+        allow_redirects=True,
+        timeout=kwargs.get("timeout", 10),
+        params=kwargs.get("params", {}),
+        stream=kwargs.get("stream", False),
+        headers=headers,
+    )
+    if not kwargs.get("raise_err", False):
+        response.raise_for_status()
+    else:
+        if response.status_code > 399:
+            if response.status_code < 500:
+                logger.error("Client-side error in request to %s :: %s", url, response.text)
+            else:
+                logger.error("Server-side error in request to %s :: %s", url, response.text)
+    time.sleep(kwargs.get("wait_time", 1))
+    return response
+
+
 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
     """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
@@ -60,6 +91,6 @@ def convert_to_dataframe(d: dict) -> polars.DataFrame:
     fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
     # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
     df = polars.from_dicts(flatdata, schema=fieldnames)
-    logger.debug("Dataframe: %s", df)
-    logger.debug("All header fields: %s", fieldnames)
+    # logger.debug("Dataframe: %s", df)
+    # logger.debug("All header fields: %s", fieldnames)
     return df
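
For reference, a brief usage sketch of the new wrapper under the defaults shown above; the first URL is a hypothetical placeholder, the second mirrors the Wikidata call in enrichers/wikidata.py:

    from utils import req_get

    # default behavior: merge default_headers, follow redirects, 10 s timeout,
    # raise_for_status() on HTTP errors, then sleep 1 second before returning
    resp = req_get("https://example.org/page", timeout=30)

    # as written in the diff, raise_err=True switches from raising to logging
    # 4xx/5xx responses, and wait_time stretches the post-request sleep to 2 seconds
    resp = req_get(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbsearchentities", "search": "example", "language": "en", "format": "json"},
        raise_err=True,
        wait_time=2,
    )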
