Skip to content

Commit f43c70d

Browse files
authored
Merge branch 'master' into GH1005-fix-cnv-fixed-denominator
2 parents 317c9b1 + cd62177 commit f43c70d

3 files changed

Lines changed: 138 additions & 5 deletions

File tree

.codecov.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
coverage:
2+
status:
3+
project:
4+
default:
5+
target: auto
6+
patch:
7+
default:
8+
target: 80%
9+
threshold: 0%

malariagen_data/anoph/base.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,13 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
562562
release_manifest_df = self._read_sample_sets_manifest(
563563
single_release=sample_set_release
564564
)
565+
566+
if "unrestricted_use" not in release_manifest_df.columns:
567+
raise ValueError(
568+
f"Column 'unrestricted_use' missing from manifest for sample set '{sample_set}'. "
569+
"This indicates a data integrity issue in the release manifest."
570+
)
571+
565572
sample_set_records_srs = release_manifest_df.loc[
566573
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
567574
]
@@ -824,12 +831,19 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
824831
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
825832
if self._cache_sample_set_to_terms_of_use_info is None:
826833
df_sample_sets = self._available_sample_sets().set_index("sample_set")
834+
expected_cols = [
835+
"terms_of_use_expiry_date",
836+
"terms_of_use_url",
837+
"unrestricted_use",
838+
]
839+
missing_cols = [c for c in expected_cols if c not in df_sample_sets.columns]
840+
if missing_cols:
841+
raise ValueError(
842+
f"Terms-of-use columns missing from manifest: {missing_cols}. "
843+
"This indicates a data integrity issue in the release manifest."
844+
)
827845
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
828-
[
829-
"terms_of_use_expiry_date",
830-
"terms_of_use_url",
831-
"unrestricted_use",
832-
]
846+
expected_cols
833847
].to_dict(orient="index")
834848
try:
835849
return self._cache_sample_set_to_terms_of_use_info[sample_set]

tests/anoph/test_base.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,113 @@ def test_lookup_study(fixture, api):
256256

257257
with pytest.raises(ValueError):
258258
api.lookup_study("foobar")
259+
260+
261+
def _strip_terms_of_use_from_manifest(manifest_path):
262+
"""Rewrite a manifest TSV file without terms-of-use columns."""
263+
df = pd.read_csv(manifest_path, sep="\t")
264+
cols_to_drop = [c for c in df.columns if c.startswith("terms_of_use")]
265+
df = df.drop(columns=cols_to_drop)
266+
df.to_csv(manifest_path, index=False, sep="\t")
267+
268+
269+
def test_lookup_terms_of_use_info_missing_columns(ag3_sim_fixture):
    """Check that `lookup_terms_of_use_info` raises when terms-of-use columns are absent.

    The v3 and v3.1 manifests of the simulated bucket are rewritten in place
    without their terms-of-use columns, then restored from backup copies once
    the assertion has run.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    backups = []

    try:
        # Take the backups inside the try block: if a later copy fails, the
        # backups already recorded are still moved back by the finally
        # clause instead of being left behind as stray .bak files.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append(bp)

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        sample_set = "1177-VO-ML-LEHMANN-VMF00004"
        with pytest.raises(ValueError, match="Terms-of-use columns missing"):
            api.lookup_terms_of_use_info(sample_set)
    finally:
        # zip() stops at the shorter sequence, so only manifests that were
        # actually backed up are restored.
        for mp, bp in zip(manifest_paths, backups):
            shutil.move(bp, mp)
301+
302+
303+
def test_sample_set_has_unrestricted_use_missing_column(ag3_sim_fixture):
    """Check that `_sample_set_has_unrestricted_use` raises on stripped manifests.

    The v3 and v3.1 manifests of the simulated bucket are rewritten in place
    without their terms-of-use columns, then restored from backup copies once
    the assertion has run.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    backups = []

    try:
        # Take the backups inside the try block: if a later copy fails, the
        # backups already recorded are still moved back by the finally
        # clause instead of being left behind as stray .bak files.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append(bp)

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        sample_set = "1177-VO-ML-LEHMANN-VMF00004"
        with pytest.raises(ValueError, match="unrestricted_use.*missing"):
            api._sample_set_has_unrestricted_use(sample_set=sample_set)
    finally:
        # zip() stops at the shorter sequence, so only manifests that were
        # actually backed up are restored.
        for mp, bp in zip(manifest_paths, backups):
            shutil.move(bp, mp)
335+
336+
337+
def test_sample_sets_no_terms_of_use(ag3_sim_fixture):
    """Check that `sample_sets` still returns data when terms-of-use columns are absent.

    The v3 and v3.1 manifests of the simulated bucket are rewritten in place
    without their terms-of-use columns, then restored from backup copies once
    the assertions have run.
    """
    import shutil

    manifest_paths = [
        ag3_sim_fixture.bucket_path / "v3" / "manifest.tsv",
        ag3_sim_fixture.bucket_path / "v3.1" / "manifest.tsv",
    ]
    backups = []

    try:
        # Take the backups inside the try block: if a later copy fails, the
        # backups already recorded are still moved back by the finally
        # clause instead of being left behind as stray .bak files.
        for mp in manifest_paths:
            bp = mp.parent / "manifest.tsv.bak"
            shutil.copy2(mp, bp)
            backups.append(bp)

        for mp in manifest_paths:
            _strip_terms_of_use_from_manifest(mp)

        api = AnophelesBase(
            url=ag3_sim_fixture.url,
            public_url=ag3_sim_fixture.url,
            config_path=_ag3.CONFIG_PATH,
            major_version_number=_ag3.MAJOR_VERSION_NUMBER,
            major_version_path=_ag3.MAJOR_VERSION_PATH,
            pre=True,
        )

        # The release listing must load as a non-empty DataFrame.
        df = api.sample_sets(release="3.1")
        assert isinstance(df, pd.DataFrame)
        assert len(df) > 0
    finally:
        # zip() stops at the shorter sequence, so only manifests that were
        # actually backed up are restored.
        for mp, bp in zip(manifest_paths, backups):
            shutil.move(bp, mp)

0 commit comments

Comments
 (0)