Commit fa50dfc (parent: 6067ffd)

reduce logging of inspections

Signed-off-by: John Seekins <john@robot-house.us>

8 files changed: 17 additions & 21 deletions

ice_scrapers/agencies.py

Lines changed: 2 additions & 3 deletions
@@ -12,7 +12,7 @@
 from utils import (
     logger,
     output_folder,
-    session,
+    req_get,
 )
 from .utils import download_file

@@ -22,8 +22,7 @@
 def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
     """Collect data on participating agencies"""
     start_time = time.time()
-    resp = session.get(base_xlsx_url, timeout=120)
-    resp.raise_for_status()
+    resp = req_get(base_xlsx_url, timeout=120)
     soup = BeautifulSoup(resp.content, "html.parser")
     links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
     if not links:
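
The session.get → req_get swap above repeats across this commit, so it is worth pinning down what the helper presumably does. The real utils.req_get is not shown in this diff; a minimal sketch consistent with the call sites (folding the deleted per-call resp.raise_for_status() into the helper, plus a wait_time throttle, both assumptions) looks like this:

import time

import requests

session = requests.Session()


def req_get(url: str, timeout: int = 30, wait_time: float = 1.0, **kwargs) -> requests.Response:
    """Hypothetical wrapper: one place for throttling and HTTP error checks.

    Only the call signature req_get(url, timeout=..., wait_time=..., stream=...)
    is visible in this diff; the sleep placement and 1.0s default are guesses.
    """
    time.sleep(wait_time)  # assumed politeness delay between scrapes
    resp = session.get(url, timeout=timeout, **kwargs)
    resp.raise_for_status()  # replaces the raise_for_status() removed at each call site
    return resp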

ice_scrapers/facilities_scraper.py

Lines changed: 2 additions & 2 deletions
@@ -95,7 +95,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
         return datetime.datetime.strptime(default_timestamp, timestamp_format)
     logger.debug(" Fetching: %s", url)
     try:
-        response = req_get(url, timeout=30)
+        response = req_get(url, timeout=30, wait_time=0.1)
     except Exception as e:
         logger.error(" Error parsing %s: %s", url, e)
         return datetime.datetime.strptime(default_timestamp, timestamp_format)

@@ -117,7 +117,7 @@ def _scrape_page(page_url: str) -> list:
     """Scrape a single page of facilities using BeautifulSoup"""
     logger.debug(" Fetching: %s", page_url)
     try:
-        response = req_get(page_url, timeout=30)
+        response = req_get(page_url, timeout=30, wait_time=0.1)
     except Exception as e:
         logger.error(" Error parsing %s: %s", page_url, e)
         return []
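
Both fetches in this file now pass wait_time=0.1. Assuming wait_time is a per-request sleep as in the sketch above, the impact on a crawl is easy to estimate (the page count and the 1.0s default are hypothetical):

# Back-of-envelope throttle math; numbers are illustrative, not from the scraper.
pages = 200                  # hypothetical facility pages fetched in one run
default_delay = pages * 1.0  # assumed default wait_time
reduced_delay = pages * 0.1  # wait_time=0.1 as set in this commit
print(f"deliberate delay: {default_delay:.0f}s -> {reduced_delay:.0f}s")  # 200s -> 20s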

ice_scrapers/field_offices.py

Lines changed: 2 additions & 3 deletions
@@ -14,7 +14,7 @@
 import time
 from utils import (
     logger,
-    session,
+    req_get,
 )
 from .utils import get_ice_scrape_pages

@@ -45,8 +45,7 @@ def _scrape_page(page_url: str) -> list[dict]:
     """Scrape a single page of facilities using BeautifulSoup"""
     logger.debug(" Fetching: %s", page_url)
     try:
-        response = session.get(page_url, timeout=30)
-        response.raise_for_status()
+        response = req_get(page_url, timeout=30)
     except Exception as e:
         logger.error(" Error parsing %s: %s", page_url, e)
         return []

ice_scrapers/general.py

Lines changed: 4 additions & 4 deletions
@@ -26,6 +26,7 @@ def facilities_scrape_wrapper(
     facilities_data["facilities"] = copy.deepcopy(facilities)
     facility_name_map = {v["name"].lower(): k for k, v in facilities_data["facilities"].items()}
     inspections = find_inspections(keep_text=inspection_text)
+    facilities_data = scrape_facilities(facilities_data)

     # actually attach inspections to facilities
     for facility, inspect in inspections.items():

@@ -36,16 +37,15 @@
                 inspect
             )
             break
-        logger.debug(" Checking fuzzy matches:")
+        # logger.debug(" Checking fuzzy matches:")
         for k, v in facility_name_map.items():
             r = fuzz.partial_ratio(facility, k)
-            logger.debug(" %s === %s, ratio: %s", facility, k, r)
+            # logger.debug(" %s === %s, ratio: %s", facility, k, r)
             if r > 80:
-                logger.info(" Probably the right facility %s => %s, (ratio %s)", k, facility, r)
+                logger.debug(" Probably the right facility %s => %s, (ratio %s)", k, facility, r)
                 facilities_data["facilities"][facility_name_map[k]]["inspection"]["details"] = copy.deepcopy(inspect)
                 break

-    facilities_data = scrape_facilities(facilities_data)
     if not skip_vera:
         facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download)
     field_offices = scrape_field_offices()
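
This file is the core of the "reduce logging of inspections" change: the two per-comparison debug lines are commented out and the match announcement drops from info to debug, so a run no longer emits one log line per (inspection, facility) pair; the first hunk also hoists the scrape_facilities() call ahead of the matching loop. For context on the threshold, fuzz.partial_ratio scores the best-matching substring on a 0-100 scale. This assumes the project's fuzz import is rapidfuzz or thefuzz (both expose this function), and the facility names below are made up:

from rapidfuzz import fuzz  # thefuzz exposes the same partial_ratio API

# partial_ratio lets a short inspection name match a longer canonical
# facility name; the scraper accepts scores above 80.
inspection_name = "stewart detention center"
print(fuzz.partial_ratio(inspection_name, "stewart detention center (cca)"))   # exact substring: well above 80
print(fuzz.partial_ratio(inspection_name, "otero county processing center"))  # unrelated: well below 80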

ice_scrapers/inspections.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 from utils import (
     logger,
     output_folder,
-    session,
+    req_get,
 )
 from .utils import download_file

@@ -38,7 +38,7 @@ def find_inspections(keep_text: bool = True) -> dict:
     os.makedirs(storage_dir, exist_ok=True)
     inspections: dict = {}
     logger.info("Collecting inspection reports from %s", root_url)
-    resp = session.get(root_url, timeout=120)
+    resp = req_get(root_url, timeout=120)
     resp.raise_for_status()
     soup = BeautifulSoup(resp.content, "html.parser")
     content = soup.select_one("div.facility-inspections")

ice_scrapers/spreadsheet_load.py

Lines changed: 2 additions & 3 deletions
@@ -15,7 +15,7 @@
 from utils import (
     logger,
     output_folder,
-    session,
+    req_get,
 )
 from .utils import (
     download_file,

@@ -64,8 +64,7 @@

 def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> tuple[polars.DataFrame, str]:
     """Download the detention stats sheet from ice.gov"""
-    resp = session.get(base_xlsx_url, timeout=120)
-    resp.raise_for_status()
+    resp = req_get(base_xlsx_url, timeout=120)
     soup = BeautifulSoup(resp.content, "html.parser")
     links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
     if not links:

ice_scrapers/utils.py

Lines changed: 1 addition & 1 deletion
@@ -304,7 +304,7 @@ def get_ice_scrape_pages(url: str) -> list[str]:
     This _may_ be generic to Drupal's pagination code...
     """
     try:
-        resp = req_get(url, timeout=30)
+        resp = req_get(url, timeout=30, wait_time=0.1)
     except Exception:
         return []
     soup = BeautifulSoup(resp.content, "html.parser")

ice_scrapers/vera_data.py

Lines changed: 2 additions & 3 deletions
@@ -6,7 +6,7 @@
 from utils import (
     logger,
     output_folder,
-    session,
+    req_get,
 )

 # Github can aggressively rate-limit requests, so this may fail in surprising ways!

@@ -216,8 +216,7 @@ def _vera_city_fixes(city: str, state: str) -> tuple[str, bool]:
 def collect_vera_facility_data(facilities_data: dict, keep_sheet: bool = True, force_download: bool = True) -> dict:
     logger.info("Collecting and extracting data from vera.org facility data...")
     if force_download or not os.path.exists(filename):
-        res = session.get(base_url, timeout=120, stream=True)
-        res.raise_for_status()
+        res = req_get(base_url, timeout=120, stream=True)
         size = len(res.content)
         with open(filename, "wb") as f:
             for chunk in res.iter_content(chunk_size=1024):
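
One detail in the hunk above: stream=True defers the download, but size = len(res.content) then forces the whole body into memory, after which iter_content merely re-slices that cached buffer. A fully streamed variant (illustrative only; not what this commit changes, and it assumes req_get returns a requests.Response) would count bytes as chunks arrive:

# Sketch of a streamed download; `base_url` and `filename` stand in for the
# module-level values used by collect_vera_facility_data.
size = 0
res = req_get(base_url, timeout=120, stream=True)
with open(filename, "wb") as f:
    for chunk in res.iter_content(chunk_size=1024):
        f.write(chunk)
        size += len(chunk)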
