|
1 | 1 | import copy |
| 2 | +from thefuzz import fuzz # type: ignore [import-untyped] |
2 | 3 | from schemas import facilities_schema |
3 | 4 | from .agencies import scrape_agencies |
4 | 5 | from .custom_facilities import insert_additional_facilities |
|
10 | 11 | from .inspections import find_inspections |
11 | 12 | from .spreadsheet_load import load_sheet |
12 | 13 | from .vera_data import collect_vera_facility_data |
| 14 | +from utils import logger |
13 | 15 |
|
14 | 16 |
|
15 | 17 | def facilities_scrape_wrapper( |
16 | | - keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False |
| 18 | + keep_sheet: bool = True, |
| 19 | + force_download: bool = True, |
| 20 | + skip_vera: bool = False, |
| 21 | + inspection_text: bool = False, |
17 | 22 | ) -> tuple[dict, dict]: |
18 | | - _ = find_inspections() |
19 | 23 | agencies = scrape_agencies(keep_sheet, force_download) |
20 | 24 | facilities_data = copy.deepcopy(facilities_schema) |
21 | 25 | facilities = load_sheet(keep_sheet, force_download) |
22 | 26 | facilities_data["facilities"] = copy.deepcopy(facilities) |
| 27 | + facility_name_map = {v["name"].lower(): k for k, v in facilities_data["facilities"].items()} |
| 28 | + inspections = find_inspections(keep_text=inspection_text) |
| 29 | + |
| 30 | + # actually attach inspections to facilities |
| 31 | + for facility, inspect in inspections.items(): |
| 32 | + logger.debug(" Matching %s for inspection details...", facility) |
| 33 | + # exact match (extremely unlikely) |
| 34 | + if facility.lower() in facility_name_map: |
| 35 | + facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy( |
| 36 | + inspect |
| 37 | + ) |
| 38 | + break |
| 39 | + logger.debug(" Checking fuzzy matches:") |
| 40 | + for k, v in facility_name_map.items(): |
| 41 | + r = fuzz.partial_ratio(facility, k) |
| 42 | + logger.debug(" %s === %s, ratio: %s", facility, k, r) |
| 43 | + if r > 80: |
| 44 | + logger.info(" Probably the right facility %s => %s, (ratio %s)", k, facility, r) |
| 45 | + facilities_data["facilities"][facility_name_map[k]]["inspection"]["details"] = copy.deepcopy(inspect) |
| 46 | + break |
| 47 | + |
23 | 48 | facilities_data = scrape_facilities(facilities_data) |
24 | 49 | if not skip_vera: |
25 | 50 | facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download) |
|
0 commit comments