Skip to content

Commit ea15930

Browse files
committed
fix: handle missing terms-of-use columns in manifest (#766)
1 parent 1b2ef11 commit ea15930

2 files changed

Lines changed: 141 additions & 7 deletions

File tree

malariagen_data/anoph/base.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,10 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
562562
release_manifest_df = self._read_sample_sets_manifest(
563563
single_release=sample_set_release
564564
)
565+
566+
if "unrestricted_use" not in release_manifest_df.columns:
567+
return False
568+
565569
sample_set_records_srs = release_manifest_df.loc[
566570
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
567571
]
@@ -824,13 +828,27 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
824828
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
825829
if self._cache_sample_set_to_terms_of_use_info is None:
826830
df_sample_sets = self._available_sample_sets().set_index("sample_set")
827-
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
828-
[
829-
"terms_of_use_expiry_date",
830-
"terms_of_use_url",
831-
"unrestricted_use",
832-
]
833-
].to_dict(orient="index")
831+
expected_cols = [
832+
"terms_of_use_expiry_date",
833+
"terms_of_use_url",
834+
"unrestricted_use",
835+
]
836+
placeholder_values = {
837+
"terms_of_use_expiry_date": "2099-12-31",
838+
"terms_of_use_url": float("nan"),
839+
"unrestricted_use": False,
840+
}
841+
available_cols = [c for c in expected_cols if c in df_sample_sets.columns]
842+
if available_cols:
843+
lookup = df_sample_sets[available_cols].to_dict(orient="index")
844+
missing_cols = set(expected_cols) - set(available_cols)
845+
if missing_cols:
846+
for ss in lookup:
847+
for mc in missing_cols:
848+
lookup[ss][mc] = placeholder_values[mc]
849+
else:
850+
lookup = {ss: dict(placeholder_values) for ss in df_sample_sets.index}
851+
self._cache_sample_set_to_terms_of_use_info = lookup
834852
try:
835853
return self._cache_sample_set_to_terms_of_use_info[sample_set]
836854
except KeyError as e:

tests/anoph/test_base.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,119 @@ def test_lookup_study(fixture, api):
256256

257257
with pytest.raises(ValueError):
258258
api.lookup_study("foobar")
259+
260+
261+
def _strip_terms_of_use_from_manifest(manifest_path):
    """Rewrite a manifest TSV file without terms-of-use columns.

    Every column whose name starts with ``"terms_of_use"`` is dropped and the
    file at ``manifest_path`` is overwritten in place. All other columns are
    written back with the same cell text they were read with.

    Parameters
    ----------
    manifest_path : str or path-like
        Location of the tab-separated manifest file to rewrite.

    """
    # Read every cell as a literal string and switch off NA parsing so the
    # remaining columns survive the round trip unchanged. With pandas' default
    # inference a value such as "NA" would be written back as an empty cell
    # and "007" as "7".
    df = pd.read_csv(manifest_path, sep="\t", dtype=str, keep_default_na=False)
    cols_to_drop = [c for c in df.columns if c.startswith("terms_of_use")]
    df = df.drop(columns=cols_to_drop)
    df.to_csv(manifest_path, index=False, sep="\t")
267+
268+
269+
def test_lookup_terms_of_use_info_missing_columns(ag3_sim_fixture):
    """Check ``lookup_terms_of_use_info`` when manifests lack terms-of-use columns.

    The v3 and v3.1 manifests under the fixture's bucket path are rewritten
    without their ``terms_of_use*`` columns. The lookup is then expected to
    return a dict carrying placeholder values for all three terms-of-use keys.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    # (manifest, backup) pairs, recorded only once the backup copy exists.
    backups = []

    try:
        # Back up inside the ``try`` so that a failure part-way through still
        # restores (and cleans up) every manifest that was already copied.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append((mp, bp))

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        sample_set = "1177-VO-ML-LEHMANN-VMF00004"
        info = api.lookup_terms_of_use_info(sample_set)
        assert isinstance(info, dict)
        assert "terms_of_use_expiry_date" in info
        assert "terms_of_use_url" in info
        assert "unrestricted_use" in info
        assert info["terms_of_use_expiry_date"] == "2099-12-31"
        assert pd.isna(info["terms_of_use_url"])
        assert info["unrestricted_use"] is False
    finally:
        # Put the original manifests back. NOTE(review): presumably the
        # fixture files are shared with other tests -- confirm fixture scope.
        for mp, bp in backups:
            shutil.move(bp, mp)
307+
308+
309+
def test_sample_set_has_unrestricted_use_missing_column(ag3_sim_fixture):
    """Check ``_sample_set_has_unrestricted_use`` with terms-of-use columns removed.

    The v3 and v3.1 manifests under the fixture's bucket path are rewritten
    without their ``terms_of_use*`` columns. The method is then expected to
    return ``False`` for a known sample set.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    # (manifest, backup) pairs, recorded only once the backup copy exists.
    backups = []

    try:
        # Back up inside the ``try`` so that a failure part-way through still
        # restores (and cleans up) every manifest that was already copied.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append((mp, bp))

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        sample_set = "1177-VO-ML-LEHMANN-VMF00004"
        result = api._sample_set_has_unrestricted_use(sample_set=sample_set)
        assert result is False
    finally:
        # Put the original manifests back. NOTE(review): presumably the
        # fixture files are shared with other tests -- confirm fixture scope.
        for mp, bp in backups:
            shutil.move(bp, mp)
341+
342+
343+
def test_sample_sets_no_terms_of_use(ag3_sim_fixture):
    """Check ``sample_sets`` still works when manifests lack terms-of-use columns.

    The v3 and v3.1 manifests under the fixture's bucket path are rewritten
    without their ``terms_of_use*`` columns. ``sample_sets(release="3.1")`` is
    then expected to return a non-empty DataFrame.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    # (manifest, backup) pairs, recorded only once the backup copy exists.
    backups = []

    try:
        # Back up inside the ``try`` so that a failure part-way through still
        # restores (and cleans up) every manifest that was already copied.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append((mp, bp))

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        df = api.sample_sets(release="3.1")
        assert isinstance(df, pd.DataFrame)
        assert len(df) > 0
    finally:
        # Put the original manifests back. NOTE(review): presumably the
        # fixture files are shared with other tests -- confirm fixture scope.
        for mp, bp in backups:
            shutil.move(bp, mp)

0 commit comments

Comments
 (0)