Skip to content

Commit 3853881

Browse files
committed
Fix #1108: Replace truthiness checks with explicit None checks in genome sequence slicing
PlasmodiumDataResource._subset_genome_sequence_region() and AnophelesGenomeSequenceData.genome_sequence() used truthiness checks (if region.start:) which treated 0 as falsy, silently skipping slicing. Changes: - plasmodium.py: Replace truthiness with 'is not None', add start/end >= 1 validation, fix docstring typo 'Sebset' -> 'Subset' - genome_sequence.py: Same truthiness fix + bounds validation - snp_data.py: Fix same bug in is_accessible() (line 1726) - test_plasmodium.py: Add 3 regression tests for valid slice, start=0, end=0 The library uses 1-based genomic coordinates (start - 1 conversion), so start=0 and end=0 now raise ValueError instead of silently misbehaving.
1 parent c269768 commit 3853881

4 files changed

Lines changed: 43 additions & 6 deletions

File tree

malariagen_data/anoph/genome_sequence.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,13 @@ def genome_sequence(
117117

118118
# Deal with region start and stop.
119119
slice_start = slice_stop = None
120-
if resolved_region.start:
120+
if resolved_region.start is not None:
121+
if resolved_region.start < 1:
122+
raise ValueError("Region start must be >= 1 or None.")
121123
slice_start = resolved_region.start - 1
122-
if resolved_region.end:
124+
if resolved_region.end is not None:
125+
if resolved_region.end < 1:
126+
raise ValueError("Region end must be >= 1 or None.")
123127
slice_stop = resolved_region.end
124128
loc_region = slice(slice_start, slice_stop)
125129

malariagen_data/anoph/snp_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1723,7 +1723,7 @@ def is_accessible(
17231723

17241724
# Access SNP site positions.
17251725
pos = self.snp_sites(region=resolved_region, field="POS").compute()
1726-
if resolved_region.start:
1726+
if resolved_region.start is not None:
17271727
offset = resolved_region.start
17281728
else:
17291729
offset = 1

malariagen_data/plasmodium.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,17 +232,22 @@ def _resolve_region(self, region):
232232
def _subset_genome_sequence_region(
233233
self, genome, region, inline_array=True, chunks="native"
234234
):
235-
"""Sebset reference genome sequence."""
235+
"""Subset reference genome sequence."""
236236
region = self._resolve_region(region)
237237
z = genome[region.contig]
238238

239239
d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
240240

241-
if region.start:
241+
if region.start is not None:
242+
if region.start < 1:
243+
raise ValueError("Region start must be >= 1 or None.")
242244
slice_start = region.start - 1
243245
else:
244246
slice_start = None
245-
if region.end:
247+
248+
if region.end is not None:
249+
if region.end < 1:
250+
raise ValueError("Region end must be >= 1 or None.")
246251
slice_stop = region.end
247252
else:
248253
slice_stop = None

tests/test_plasmodium.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,34 @@ def test_subset_genome_sequence_contig2(self):
690690
assert sequence.shape == (15,)
691691
assert sequence.ndim == 1
692692

693+
def test_subset_genome_sequence_region_slice(self):
694+
from malariagen_data.util import Region
695+
# contig1 is np.arange(10)
696+
# Valid slice 1-3 -> indices 0,1,2 (length 3)
697+
region = Region("contig1", 1, 3)
698+
sequence = self.test_plasmodium_class._subset_genome_sequence_region(
699+
genome=self.test_ref_zarr, region=region
700+
)
701+
assert isinstance(sequence, da.Array)
702+
assert sequence.shape == (3,)
703+
assert sequence.compute().tolist() == [0, 1, 2]
704+
705+
def test_subset_genome_sequence_region_invalid_start_0(self):
706+
from malariagen_data.util import Region
707+
region = Region("contig1", 0, 3)
708+
with self.assertRaises(ValueError):
709+
self.test_plasmodium_class._subset_genome_sequence_region(
710+
genome=self.test_ref_zarr, region=region
711+
)
712+
713+
def test_subset_genome_sequence_region_invalid_end_0(self):
714+
from malariagen_data.util import Region
715+
region = Region("contig1", 1, 0)
716+
with self.assertRaises(ValueError):
717+
self.test_plasmodium_class._subset_genome_sequence_region(
718+
genome=self.test_ref_zarr, region=region
719+
)
720+
693721

694722
if __name__ == "__main__":
695723
unittest.main()

0 commit comments

Comments
 (0)