Skip to content

Commit 657f1ce

Browse files
committed
add text extraction
Signed-off-by: John Seekins <john@robot-house.us>
1 parent 19c15f4 commit 657f1ce

5 files changed

Lines changed: 257 additions & 19 deletions

File tree

ice_scrapers/general.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
def facilities_scrape_wrapper(
1616
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
1717
) -> tuple[dict, dict]:
18+
_ = find_inspections()
1819
agencies = scrape_agencies(keep_sheet, force_download)
1920
facilities_data = copy.deepcopy(facilities_schema)
2021
facilities = load_sheet(keep_sheet, force_download)
@@ -25,6 +26,5 @@ def facilities_scrape_wrapper(
2526
field_offices = scrape_field_offices()
2627
facilities_data = merge_field_offices(facilities_data, field_offices)
2728
facilities_data = insert_additional_facilities(facilities_data)
28-
_ = find_inspections()
2929

3030
return facilities_data, agencies

ice_scrapers/inspections.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from bs4 import BeautifulSoup
22
import os
3+
import pdfplumber
34
from pprint import pformat
45
import re
56
from utils import (
@@ -11,10 +12,27 @@
1112

1213
root_url = "https://www.ice.gov/foia/odo-facility-inspections"
1314
storage_dir = f"{output_folder}{os.sep}inspections{os.sep}"
15+
"""
16+
example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
17+
example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
18+
example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
19+
"""
1420
text_re = re.compile(r"^(\w+\s)?(\d+)\s(.+)\s(-|–)\s(.+)$")
1521

1622

17-
def find_inspections(download_reports: bool = False):
23+
def _extract_txt(url: str) -> str:
    """Download the PDF at *url* into the inspections storage dir and return its text.

    Args:
        url: direct link to a PDF inspection report.

    Returns:
        The concatenated text of every page, each page preceded by a newline
        (so a non-empty PDF yields a string starting with "\\n", matching the
        previous accumulation format).
    """
    file_name = url.split("/")[-1]  # type: ignore [union-attr]
    local_path = f"{storage_dir}{file_name}"
    download_file(str(url), local_path)
    pages: list[str] = []
    with pdfplumber.open(local_path) as pdf:
        for idx, page in enumerate(pdf.pages):
            # extract_text() returns None for image-only pages; the old
            # f-string accumulation silently embedded the literal "None"
            txt = page.extract_text() or ""
            logger.debug(" Page %s: %s", idx + 1, txt)
            pages.append(txt)
    # single join instead of repeated f-string concatenation (avoids quadratic copies)
    return "".join(f"\n{txt}" for txt in pages)
33+
34+
35+
def find_inspections() -> dict:
1836
os.makedirs(storage_dir, exist_ok=True)
1937
inspections: dict = {}
2038
logger.info("Collecting inspection reports from %s", root_url)
@@ -25,24 +43,23 @@ def find_inspections(download_reports: bool = False):
2543
links = content.select("a") # type: ignore [union-attr]
2644
for link in links:
2745
url = link["href"]
28-
file_name = url.split("/")[-1] # type: ignore [union-attr]
29-
"""
30-
example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
31-
example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
32-
example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
33-
There are inconsistent hyphens!
34-
"""
35-
text = text_re.search(link.text.strip())
46+
obj = {"date": "", "url": url, "text": ""}
47+
matches = text_re.search(link.text.strip())
48+
if len(matches.groups()) < 5: # type: ignore [union-attr]
49+
logger.warning(" Did not find all expected groups in %s. Skipping...", link.text.strip())
50+
continue
3651
# third capture group should be the facility name
37-
location: str = text[3] # type: ignore [index]
52+
location: str = matches.group(3) # type: ignore [union-attr]
3853
# fifth capture group should be the inspection date
39-
date: str = text[5] # type: ignore [index]
40-
logger.debug("Facility: %s, date: %s, details: %s", location, date, url)
41-
if download_reports:
42-
download_file(str(url), f"{output_folder}{os.sep}inspections{os.sep}{file_name}")
54+
date: str = matches.group(5) # type: ignore [union-attr]
55+
obj["date"] = date
56+
logger.debug(" Facility: %s, date: %s, details: %s", location, date, url)
57+
obj["text"] = _extract_txt(str(url))
58+
exit(1)
4359
if location in inspections:
44-
inspections[location].append({"date": date, "details": url})
60+
inspections[location].append(obj)
4561
else:
46-
inspections[location] = [{"date": date, "details": url}]
62+
inspections[location] = [obj]
63+
4764
logger.debug(pformat(inspections))
4865
return inspections

ice_scrapers/utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
from bs4 import BeautifulSoup
2+
import os
23
import re
34
from utils import (
45
logger,
56
session,
67
)
78

89

9-
def download_file(link: str, path: str) -> None:
10+
def download_file(link: str, path: str, redownload: bool = False) -> None:
    """
    Standard pattern for downloading a binary file from a URL.

    Args:
        link: URL to fetch.
        path: local filesystem destination for the downloaded bytes.
        redownload: when True, fetch even if *path* already exists non-empty.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0 and not redownload:
        logger.debug(" Skipping redownload of existing file %s", path)
        # BUG FIX: without this return the guard only logged and the file
        # was re-downloaded anyway
        return
    resp = session.get(link, timeout=120, stream=True)
    # accumulate size from the streamed chunks; touching resp.content would
    # buffer the entire body in memory and defeat stream=True
    size = 0
    with open(path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                size += len(chunk)
                f.write(chunk)
    logger.debug(" Wrote %s byte file to %s", size, path)
2023

2124

2225
def special_facilities(facility: dict) -> dict:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies = [
88
"beautifulsoup4>=4.13.5",
99
"fastexcel>=0.15.1",
1010
"lxml>=6.0.1",
11+
"pdfplumber>=0.11.8",
1112
"polars>=1.33.0",
1213
"pyarrow>=21.0.0",
1314
"requests>=2.32.5",

0 commit comments

Comments
 (0)