Skip to content

Commit 202476e

Browse files
committed
fix: enforce strict validation for terms-of-use metadata (#766)
1 parent: ea15930 · commit: 202476e

2 files changed

Lines changed: 17 additions & 27 deletions

File tree

malariagen_data/anoph/base.py

Lines changed: 13 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -564,7 +564,10 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
564564
)
565565

566566
if "unrestricted_use" not in release_manifest_df.columns:
567-
return False
567+
raise ValueError(
568+
f"Column 'unrestricted_use' missing from manifest for sample set '{sample_set}'. "
569+
"This indicates a data integrity issue in the release manifest."
570+
)
568571

569572
sample_set_records_srs = release_manifest_df.loc[
570573
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
@@ -833,22 +836,15 @@ def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
833836
"terms_of_use_url",
834837
"unrestricted_use",
835838
]
836-
placeholder_values = {
837-
"terms_of_use_expiry_date": "2099-12-31",
838-
"terms_of_use_url": float("nan"),
839-
"unrestricted_use": False,
840-
}
841-
available_cols = [c for c in expected_cols if c in df_sample_sets.columns]
842-
if available_cols:
843-
lookup = df_sample_sets[available_cols].to_dict(orient="index")
844-
missing_cols = set(expected_cols) - set(available_cols)
845-
if missing_cols:
846-
for ss in lookup:
847-
for mc in missing_cols:
848-
lookup[ss][mc] = placeholder_values[mc]
849-
else:
850-
lookup = {ss: dict(placeholder_values) for ss in df_sample_sets.index}
851-
self._cache_sample_set_to_terms_of_use_info = lookup
839+
missing_cols = [c for c in expected_cols if c not in df_sample_sets.columns]
840+
if missing_cols:
841+
raise ValueError(
842+
f"Terms-of-use columns missing from manifest: {missing_cols}. "
843+
"This indicates a data integrity issue in the release manifest."
844+
)
845+
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
846+
expected_cols
847+
].to_dict(orient="index")
852848
try:
853849
return self._cache_sample_set_to_terms_of_use_info[sample_set]
854850
except KeyError as e:

tests/anoph/test_base.py

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -293,14 +293,8 @@ def test_lookup_terms_of_use_info_missing_columns(ag3_sim_fixture):
293293
)
294294

295295
sample_set = "1177-VO-ML-LEHMANN-VMF00004"
296-
info = api.lookup_terms_of_use_info(sample_set)
297-
assert isinstance(info, dict)
298-
assert "terms_of_use_expiry_date" in info
299-
assert "terms_of_use_url" in info
300-
assert "unrestricted_use" in info
301-
assert info["terms_of_use_expiry_date"] == "2099-12-31"
302-
assert pd.isna(info["terms_of_use_url"])
303-
assert info["unrestricted_use"] is False
296+
with pytest.raises(ValueError, match="Terms-of-use columns missing"):
297+
api.lookup_terms_of_use_info(sample_set)
304298
finally:
305299
for mp, bp in zip(manifest_paths, backups):
306300
shutil.move(bp, mp)
@@ -333,8 +327,8 @@ def test_sample_set_has_unrestricted_use_missing_column(ag3_sim_fixture):
333327
)
334328

335329
sample_set = "1177-VO-ML-LEHMANN-VMF00004"
336-
result = api._sample_set_has_unrestricted_use(sample_set=sample_set)
337-
assert result is False
330+
with pytest.raises(ValueError, match="unrestricted_use.*missing"):
331+
api._sample_set_has_unrestricted_use(sample_set=sample_set)
338332
finally:
339333
for mp, bp in zip(manifest_paths, backups):
340334
shutil.move(bp, mp)

0 commit comments

Comments
 (0)