Skip to content

Commit 6067ffd

Browse files
committed
rough fuzzy matching
Signed-off-by: John Seekins <john@robot-house.us>
1 parent 8363d6e commit 6067ffd

4 files changed

Lines changed: 105 additions & 12 deletions

File tree

ice_scrapers/general.py

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import copy
2+
from thefuzz import fuzz # type: ignore [import-untyped]
23
from schemas import facilities_schema
34
from .agencies import scrape_agencies
45
from .custom_facilities import insert_additional_facilities
@@ -10,16 +11,40 @@
1011
from .inspections import find_inspections
1112
from .spreadsheet_load import load_sheet
1213
from .vera_data import collect_vera_facility_data
14+
from utils import logger
1315

1416

1517
def facilities_scrape_wrapper(
16-
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
18+
keep_sheet: bool = True,
19+
force_download: bool = True,
20+
skip_vera: bool = False,
21+
inspection_text: bool = False,
1722
) -> tuple[dict, dict]:
18-
_ = find_inspections()
1923
agencies = scrape_agencies(keep_sheet, force_download)
2024
facilities_data = copy.deepcopy(facilities_schema)
2125
facilities = load_sheet(keep_sheet, force_download)
2226
facilities_data["facilities"] = copy.deepcopy(facilities)
27+
facility_name_map = {v["name"].lower(): k for k, v in facilities_data["facilities"].items()}
28+
inspections = find_inspections(keep_text=inspection_text)
29+
30+
# actually attach inspections to facilities
31+
for facility, inspect in inspections.items():
32+
logger.debug(" Matching %s for inspection details...", facility)
33+
# exact match (extremely unlikely)
34+
if facility.lower() in facility_name_map:
35+
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
36+
inspect
37+
)
38+
break
39+
logger.debug(" Checking fuzzy matches:")
40+
for k, v in facility_name_map.items():
41+
r = fuzz.partial_ratio(facility, k)
42+
logger.debug(" %s === %s, ratio: %s", facility, k, r)
43+
if r > 80:
44+
logger.info(" Probably the right facility %s => %s, (ratio %s)", k, facility, r)
45+
facilities_data["facilities"][facility_name_map[k]]["inspection"]["details"] = copy.deepcopy(inspect)
46+
break
47+
2348
facilities_data = scrape_facilities(facilities_data)
2449
if not skip_vera:
2550
facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download)

ice_scrapers/inspections.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def _extract_txt(url: str) -> str:
3434
return full_text
3535

3636

37-
def find_inspections() -> dict:
37+
def find_inspections(keep_text: bool = True) -> dict:
3838
os.makedirs(storage_dir, exist_ok=True)
3939
inspections: dict = {}
4040
logger.info("Collecting inspection reports from %s", root_url)
@@ -55,15 +55,16 @@ def find_inspections() -> dict:
5555
# fifth capture group should be the inspection date
5656
date: str = matches.group(5) # type: ignore [union-attr]
5757
obj["date"] = date
58-
text = zstd.compress(_extract_txt(str(url)).encode("utf-8"))
59-
logger.debug(
60-
" Facility: %s, date: %s, url: %s, report length (compressed): %s",
61-
location,
62-
date,
63-
url,
64-
sys.getsizeof(text),
65-
)
66-
obj["text"] = text
58+
if keep_text:
59+
text = zstd.compress(_extract_txt(str(url)).encode("utf-8"))
60+
logger.debug(
61+
" Facility: %s, date: %s, url: %s, report length (compressed): %s",
62+
location,
63+
date,
64+
url,
65+
sys.getsizeof(text),
66+
)
67+
obj["text"] = text
6768
if location in inspections:
6869
inspections[location].append(obj)
6970
else:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies = [
1212
"polars>=1.33.0",
1313
"pyarrow>=21.0.0",
1414
"requests>=2.32.5",
15+
"thefuzz>=0.22.1",
1516
"xlsxwriter>=3.2.5",
1617
]
1718

uv.lock

Lines changed: 66 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)