11from bs4 import BeautifulSoup
22import os
3+ import pdfplumber
34from pprint import pformat
45import re
56from utils import (
1112
1213root_url = "https://www.ice.gov/foia/odo-facility-inspections"
1314storage_dir = f"{ output_folder } { os .sep } inspections{ os .sep } "
15+ """
16+ example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
17+ example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
18+ example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
19+ """
1420text_re = re .compile (r"^(\w+\s)?(\d+)\s(.+)\s(-|–)\s(.+)$" )
1521
1622
17- def find_inspections (download_reports : bool = False ):
def _extract_txt(url: str) -> str:
    """Download the PDF at ``url`` into ``storage_dir`` and return its text.

    The file is fetched with ``download_file`` and parsed with pdfplumber;
    each page's extracted text is appended preceded by a newline, so the
    result starts with "\n" (same shape as the original accumulator, which
    began from the empty string). A page whose ``extract_text()`` returns
    None contributes the literal string "None", as the original f-string did.
    """
    # Local file name is the last path segment of the URL.
    file_name = url.split("/")[-1]  # type: ignore [union-attr]
    local_path = f"{storage_dir}{file_name}"  # hoisted: used for download and open
    download_file(str(url), local_path)

    # Collect per-page text and join once at the end instead of rebuilding
    # the whole string on every iteration (avoids quadratic concatenation).
    chunks: list = []
    with pdfplumber.open(local_path) as pdf:
        for idx, page in enumerate(pdf.pages):
            txt = page.extract_text()
            logger.debug(" Page %s: %s", idx + 1, txt)
            chunks.append(f"\n{txt}")
    return "".join(chunks)
33+
34+
35+ def find_inspections () -> dict :
# NOTE(review): this span is a unified diff (old/new line numbers and +/- markers
# are fused into the text) and the hunk header below hides the lines that fetch
# and parse the page into `content`, so the function is incomplete here.
# Annotated only — not rewritten.
1836 os .makedirs (storage_dir , exist_ok = True )
1937 inspections : dict = {}
2038 logger .info ("Collecting inspection reports from %s" , root_url )
@@ -25,24 +43,23 @@ def find_inspections(download_reports: bool = False):
2543 links = content .select ("a" ) # type: ignore [union-attr]
2644 for link in links :
2745 url = link ["href" ]
28- file_name = url .split ("/" )[- 1 ] # type: ignore [union-attr]
29- """
30- example: 2011 Calhoun County Correctional Facility, Battle Creek, MI - Dec. 6-8, 2011
31- example 2: 2024 Chippewa County, Sault Sainte Marie, MI – Apr. 23-25, 2024
32- example 3: FY 2018 South Texas ICE Processing Center Compliance Inspection Report – Pearsall, TX - May 1-3, 2018
33- There are inconsistent hyphens!
34- """
35- text = text_re .search (link .text .strip ())
46+ obj = {"date" : "" , "url" : url , "text" : "" }
47+ matches = text_re .search (link .text .strip ())
# NOTE(review): re.search returns None when the pattern does not match, so the
# `.groups()` call below can raise AttributeError. Also, for a compiled pattern
# with five groups, any successful match yields exactly five entries (unmatched
# optional groups come back as None), so `len(matches.groups()) < 5` can never
# be true — the intended guard is almost certainly `if matches is None:`.
48+ if len (matches .groups ()) < 5 : # type: ignore [union-attr]
49+ logger .warning (" Did not find all expected groups in %s. Skipping..." , link .text .strip ())
50+ continue
3651 # third capture group should be the facility name
37- location : str = text [ 3 ] # type: ignore [index ]
52+ location : str = matches . group ( 3 ) # type: ignore [union-attr ]
3853 # fifth capture group should be the inspection date
39- date : str = text [5 ] # type: ignore [index]
40- logger .debug ("Facility: %s, date: %s, details: %s" , location , date , url )
41- if download_reports :
42- download_file (str (url ), f"{ output_folder } { os .sep } inspections{ os .sep } { file_name } " )
54+ date : str = matches .group (5 ) # type: ignore [union-attr]
55+ obj ["date" ] = date
56+ logger .debug (" Facility: %s, date: %s, details: %s" , location , date , url )
57+ obj ["text" ] = _extract_txt (str (url ))
# NOTE(review): leftover debug statement — exit(1) terminates the whole process
# after the first link, so at most one inspection is ever collected and the
# function never reaches its return. Remove before merging.
58+ exit (1 )
4359 if location in inspections :
44- inspections [location ].append ({ "date" : date , "details" : url } )
60+ inspections [location ].append (obj )
4561 else :
46- inspections [location ] = [{"date" : date , "details" : url }]
62+ inspections [location ] = [obj ]
63+
4764 logger .debug (pformat (inspections ))
4865 return inspections
0 commit comments