
Commit 8363d6e

committed by John Seekins
wrapper function for requests across whole project
Signed-off-by: John Seekins <john@robot-house.us>
1 parent: 3a1bdbc · commit: 8363d6e

9 files changed

Lines changed: 94 additions & 84 deletions

File tree

enrichers/__init__.py
enrichers/openstreetmap.py
enrichers/wikidata.py
enrichers/wikipedia.py
ice_scrapers/facilities_scraper.py
ice_scrapers/utils.py
main.py
schemas.py
utils.py

enrichers/__init__.py

Lines changed: 0 additions & 30 deletions
@@ -5,21 +5,13 @@
 """
 
 import copy
-import requests
 from schemas import enrich_resp_schema
-import time
-from utils import (
-    default_headers,
-    session,
-)
 
 
 class Enrichment(object):
     _required_keys = [
         "facility_name",
     ]
-    # in seconds
-    _wait_time: float = 1
 
     def __init__(self, **kwargs):
         self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
         """Child objects should implement this"""
         return {}
 
-    def _req(self, url: str, **kwargs) -> requests.Response:
-        """requests response wrapper to ensure we honor waits"""
-        headers = kwargs.get("headers", {})
-        # ensure we get all headers configured correctly
-        # but manually applied headers win the argument
-        for k, v in default_headers.items():
-            if k in headers.keys():
-                continue
-            headers[k] = v
-
-        response = session.get(
-            url,
-            allow_redirects=True,
-            timeout=kwargs.get("timeout", 10),
-            params=kwargs.get("params", {}),
-            stream=kwargs.get("stream", False),
-            headers=headers,
-        )
-        response.raise_for_status()
-        time.sleep(self._wait_time)
-        return response
-
     def _minimal_clean_facility_name(self, name: str) -> str:
         """Minimal cleaning that preserves important context like 'County Jail'"""
         cleaned = name
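
With the per-class _req helper removed, each enricher is expected to call the shared req_get wrapper from utils instead (its definition appears in the utils.py diff below). A minimal sketch of the migrated call pattern, using a hypothetical ExampleEnricher class and URL; the real call sites are in the openstreetmap.py, wikidata.py, and wikipedia.py diffs:

    # hypothetical subclass illustrating the post-commit call pattern
    from enrichers import Enrichment
    from utils import logger, req_get

    class ExampleEnricher(Enrichment):
        # assumed to live on the subclass now that the base-class attribute is removed
        _wait_time: float = 1

        def search(self) -> dict:
            try:
                # req_get merges default_headers, raises on HTTP errors by default,
                # and sleeps wait_time seconds before returning
                response = req_get("https://example.org/api", params={"q": "query"}, wait_time=self._wait_time)
                self.resp_info["title"] = response.json().get("title", "")
            except Exception as e:
                logger.debug("example search error: %s", e)
            return self.resp_info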

enrichers/openstreetmap.py

Lines changed: 5 additions & 7 deletions
@@ -1,5 +1,5 @@
 from enrichers import Enrichment
-from utils import logger
+from utils import logger, req_get
 
 
 class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
                 "dedupe": 1,
             },
             "street_address": {
-                "q": f"{full_address}",
+                "q": full_address,
                 "format": "json",
                 "limit": 5,
                 "dedupe": 1,
             },
             "locality": {
-                "q": f"{locality}",
+                "q": locality,
                 "format": "json",
                 "limit": 5,
                 "dedupe": 1,
@@ -56,7 +56,7 @@ def search(self) -> dict:
             logger.debug("Searching OSM for %s", params["q"])
             self.resp_info["search_query_steps"].append(params["q"])  # type: ignore [attr-defined]
             try:
-                response = self._req(search_url, params=params, timeout=15)
+                response = req_get(search_url, params=params, timeout=15)
                 data.extend(response.json())
             except Exception as e:
                 logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@ def search(self) -> dict:
         lon = first_result.get("lon", self.default_coords["longitude"])
         osm_type = first_result.get("osm_type", "")
         osm_id = first_result.get("osm_id", "")
-        self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
-        self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
         self.resp_info["title"] = first_result.get("display_name", "")
-        self.resp_info["details"]["class"] = first_result.get("class", "")  # type: ignore [index]
+        self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
         if osm_type == "way":
             self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
         else:

enrichers/wikidata.py

Lines changed: 30 additions & 23 deletions
@@ -1,5 +1,8 @@
 from enrichers import Enrichment
-from utils import logger
+from utils import (
+    logger,
+    req_get,
+)
 
 
 class Wikidata(Enrichment):
@@ -11,29 +14,32 @@ def search(self) -> dict:
         # Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
         # Falls back to first result (usually truncated, eg. county)
         search_name_fallback = self._clean_facility_name(facility_name)
+        self.resp_info["enrichment_type"] = "wikidata"
         logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
         search_url = "https://www.wikidata.org/w/api.php"
         params = {
-            "action": "wbsearchentities",
-            "search": facility_name,
-            "language": "en",
-            "format": "json",
-            "limit": 3,
+            "facility_name": {
+                "action": "wbsearchentities",
+                "search": facility_name,
+                "language": "en",
+                "format": "json",
+                "limit": 3,
+            },
+            "fallback": {
+                "action": "wbsearchentities",
+                "search": search_name_fallback,
+                "language": "en",
+                "format": "json",
+                "limit": 3,
+            },
         }
-        self.resp_info["enrichment_type"] = "wikidata"
         data = {}
-        try:
-            response = self._req(search_url, params=params)
-            data = response.json()
-        except Exception as e:
-            logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
-            self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
-        if not data.get("search"):
-            params["search"] = search_name_fallback
-            self.resp_info["search_query_steps"].append(search_name_fallback)  # type: ignore [attr-defined]
+        for search, params in params.items():
+            self.resp_info["search_query_steps"].append(params["search"])  # type: ignore [attr-defined]
             try:
-                response = self._req(search_url, params=params)
+                response = req_get(search_url, params=params, wait_time=self._wait_time)
                 data = response.json()
+                break
             except Exception as e:
                 logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
                 self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
@@ -45,10 +51,11 @@ def search(self) -> dict:
             if any(term in description for term in match_terms):
                 self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
                 self.resp_info["title"] = result.get("label", "")
-                return self.resp_info
-        # fallback to first result
-        first = data["search"][0]
-        logger.debug(" Closer matching failed, falling back to first result %s", first)
-        self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
-        self.resp_info["title"] = result.get("label", "")
+                break
+        else:
+            # fall back to first result
+            first = data["search"][0]
+            logger.debug(" Closer matching failed, falling back to first result %s", first)
+            self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
+            self.resp_info["title"] = first.get("label", "")
         return self.resp_info

enrichers/wikipedia.py

Lines changed: 5 additions & 5 deletions
@@ -1,6 +1,6 @@
 from enrichers import Enrichment
 from urllib.parse import quote
-from utils import logger
+from utils import logger, req_get
 
 
 class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
         self.resp_info["search_query_steps"].append(wiki_url)  # type: ignore [attr-defined]
         initial_response = False
         try:
-            response = self._req(wiki_url)
+            response = req_get(wiki_url, wait_time=self._wait_time)
             initial_response = True
         except Exception as e:
             logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
             self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
             wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
             self.resp_info["search_query_steps"].append(wiki_url)  # type: ignore [attr-defined]
             try:
-                response = self._req(wiki_url)
+                response = req_get(wiki_url, wait_time=self._wait_time)
                 initial_response = True
             except Exception as e:
                 logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
         }
 
         try:
-            response = self._req(self.api_search, params=params)
+            response = req_get(self.api_search, params=params, wait_time=self._wait_time)
             data = response.json()
         except Exception as e:
             logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:
 
         # Verify the page exists and isn't a redirect to something unrelated
         try:
-            verify_response = self._req(final_url)
+            verify_response = req_get(final_url, wait_time=self._wait_time)
         except Exception as e:
             logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
             self.resp_info["search_query_steps"].append(final_url)  # type: ignore [attr-defined]

ice_scrapers/facilities_scraper.py

Lines changed: 3 additions & 5 deletions
@@ -7,7 +7,7 @@
 from utils import (
     default_timestamp,
     logger,
-    session,
+    req_get,
     timestamp_format,
 )
 from .utils import (
@@ -95,8 +95,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
         return datetime.datetime.strptime(default_timestamp, timestamp_format)
     logger.debug(" Fetching: %s", url)
    try:
-        response = session.get(url, timeout=30)
-        response.raise_for_status()
+        response = req_get(url, timeout=30)
    except Exception as e:
        logger.error(" Error parsing %s: %s", url, e)
        return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +117,7 @@ def _scrape_page(page_url: str) -> list:
    """Scrape a single page of facilities using BeautifulSoup"""
    logger.debug(" Fetching: %s", page_url)
    try:
-        response = session.get(page_url, timeout=30)
-        response.raise_for_status()
+        response = req_get(page_url, timeout=30)
    except Exception as e:
        logger.error(" Error parsing %s: %s", page_url, e)
        return []

ice_scrapers/utils.py

Lines changed: 17 additions & 11 deletions
@@ -3,7 +3,7 @@
 import re
 from utils import (
     logger,
-    session,
+    req_get,
 )
 
 
@@ -13,13 +13,17 @@ def download_file(link: str, path: str, redownload: bool = False) -> None:
     """
     if os.path.exists(path) and os.path.getsize(path) > 0 and not redownload:
         logger.debug(" Skipping redownload of existing file %s", path)
-    resp = session.get(link, timeout=120, stream=True)
-    size = len(resp.content)
-    with open(path, "wb") as f:
-        for chunk in resp.iter_content(chunk_size=1024):
-            if chunk:
-                f.write(chunk)
-    logger.debug(" Wrote %s byte file to %s", size, path)
+    try:
+        resp = req_get(link, timeout=120, stream=True)
+    except Exception as e:
+        logger.error("Failed to download %s :: %s", link, e)
+    else:
+        size = len(resp.content)
+        with open(path, "wb") as f:
+            for chunk in resp.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+        logger.debug(" Wrote %s byte file to %s", size, path)
 
 
 def special_facilities(facility: dict) -> dict:
@@ -294,13 +298,15 @@ def update_facility(old: dict, new: dict) -> dict:
     return old
 
 
-def get_ice_scrape_pages(url: str) -> list:
+def get_ice_scrape_pages(url: str) -> list[str]:
     """
     Discover all facility pages
     This _may_ be generic to Drupal's pagination code...
     """
-    resp = session.get(url, timeout=30)
-    resp.raise_for_status()
+    try:
+        resp = req_get(url, timeout=30)
+    except Exception:
+        return []
     soup = BeautifulSoup(resp.content, "html.parser")
     links = soup.findAll("a", href=re.compile(r"\?page="))
     if not links:

main.py

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@
 from enrichers import enrich_facility_data
 from schemas import supported_output_types
 from utils import logger
-# CLI, argument parsing, script orchestration
 
 
 def main() -> None:

schemas.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@
         "last_date": None,
         "last_rating": "",
         "last_type": "",
+        "details": [],
     },
     "image_url": "",
     "osm": {

utils.py

Lines changed: 33 additions & 2 deletions
@@ -4,6 +4,7 @@
 import polars
 import requests
 from requests.adapters import HTTPAdapter
+import time
 import urllib3
 
 SCRIPTDIR = os.path.dirname(os.path.realpath(__file__))
@@ -42,6 +43,36 @@
 ]
 
 
+def req_get(url: str, **kwargs) -> requests.Response:
+    """requests response wrapper to ensure we honor waits"""
+    headers = kwargs.get("headers", {})
+    # ensure we get all headers configured correctly
+    # but manually applied headers win the argument
+    for k, v in default_headers.items():
+        if k in headers.keys():
+            continue
+        headers[k] = v
+
+    response = session.get(
+        url,
+        allow_redirects=True,
+        timeout=kwargs.get("timeout", 10),
+        params=kwargs.get("params", {}),
+        stream=kwargs.get("stream", False),
+        headers=headers,
+    )
+    if not kwargs.get("raise_err", False):
+        response.raise_for_status()
+    else:
+        if response.status_code > 399:
+            if response.status_code < 500:
+                logger.error("Client-side error in request to %s :: %s", url, response.text)
+            else:
+                logger.error("Server-side error in request to %s :: %s", url, response.text)
+    time.sleep(kwargs.get("wait_time", 1))
+    return response
+
+
 def _flatdict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
     """flatten a nested dictionary for nicer printing to workbooks (excel/csv/etc.)"""
     items: list = []
@@ -60,6 +91,6 @@ def convert_to_dataframe(d: dict) -> polars.DataFrame:
     fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys]
     # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html
     df = polars.from_dicts(flatdata, schema=fieldnames)
-    logger.debug("Dataframe: %s", df)
-    logger.debug("All header fields: %s", fieldnames)
+    # logger.debug("Dataframe: %s", df)
+    # logger.debug("All header fields: %s", fieldnames)
     return df
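
For reference, a brief usage sketch of the new wrapper under the defaults shown above; the first URL is a hypothetical placeholder, the second mirrors the Wikidata call in enrichers/wikidata.py:

    from utils import req_get

    # default behavior: merge default_headers, follow redirects, 10 s timeout,
    # raise_for_status() on HTTP errors, then sleep 1 second before returning
    resp = req_get("https://example.org/page", timeout=30)

    # as written in the diff, raise_err=True switches from raising to logging
    # 4xx/5xx responses, and wait_time stretches the post-request sleep to 2 seconds
    resp = req_get(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbsearchentities", "search": "example", "language": "en", "format": "json"},
        raise_err=True,
        wait_time=2,
    )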
