Skip to content

Commit 3e7859e

Browse files
committed
refactor: migrate karyotype tag loading to GCS and add simulated tests
- Replace importlib.resources loading with GCS-based loading via self._fs
- Introduce DEFAULT_KARYOTYPE_ANALYSIS config support
- Add inversion validation and improved contig handling
- Add simulated test data and coverage for karyotype
- Remove bundled CSV from package resources
1 parent 2a1431b commit 3e7859e

12 files changed

Lines changed: 155 additions & 1104 deletions

File tree

MANIFEST.in

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
include malariagen_data/resources/*

docs/source/Ag3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Inversion karyotypes
224224
:toctree: generated/
225225

226226
karyotype
227+
load_inversion_tags
227228

228229
Phenotype data access
229230
---------------------

malariagen_data/adar1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
inversion_tag_path=None,
134133
unrestricted_use_only=unrestricted_use_only,
135134
surveillance_use_only=surveillance_use_only,
136135
)

malariagen_data/adir1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
inversion_tag_path=None,
134133
unrestricted_use_only=unrestricted_use_only,
135134
surveillance_use_only=surveillance_use_only,
136135
)

malariagen_data/af1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ def __init__(
132132
tqdm_class=tqdm_class,
133133
taxon_colors=TAXON_COLORS,
134134
virtual_contigs=None,
135-
inversion_tag_path=None,
136135
unrestricted_use_only=unrestricted_use_only,
137136
surveillance_use_only=surveillance_use_only,
138137
)

malariagen_data/ag3.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
GENE_NAMES = {
2828
"AGAP004707": "Vgsc/para",
2929
}
30-
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"
3130

3231

3332
def _setup_aim_palettes():
@@ -213,7 +212,6 @@ def __init__(
213212
aim_species_colors=AIM_SPECIES_COLORS,
214213
virtual_contigs=VIRTUAL_CONTIGS,
215214
gene_names=GENE_NAMES,
216-
inversion_tag_path=INVERSION_TAG_PATH,
217215
unrestricted_use_only=unrestricted_use_only,
218216
surveillance_use_only=surveillance_use_only,
219217
)

malariagen_data/amin1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
inversion_tag_path=None,
134133
unrestricted_use_only=unrestricted_use_only,
135134
surveillance_use_only=surveillance_use_only,
136135
)

malariagen_data/anoph/karyotype.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,37 +36,59 @@ def _karyotype_tags_n_alt(gt, alts, inversion_alts):
3636
class AnophelesKaryotypeAnalysis(AnophelesSnpData):
3737
def __init__(
3838
self,
39-
inversion_tag_path: Optional[str] = None,
39+
karyotype_analysis: Optional[str] = None,
4040
**kwargs,
4141
):
4242
# N.B., this class is designed to work cooperatively, and
4343
# so it's important that any remaining parameters are passed
4444
# to the superclass constructor.
4545
super().__init__(**kwargs)
4646

47-
self._inversion_tag_path = inversion_tag_path
47+
# If provided, this analysis version will override the
48+
# default value provided in the release configuration.
49+
self._karyotype_analysis_override = karyotype_analysis
50+
51+
@property
52+
def _karyotype_analysis(self) -> Optional[str]:
53+
if self._karyotype_analysis_override:
54+
return self._karyotype_analysis_override
55+
else:
56+
# N.B., this will return None if the key is not present in the
57+
# config.
58+
return self.config.get("DEFAULT_KARYOTYPE_ANALYSIS")
59+
60+
def _require_karyotype_analysis(self):
61+
if not self._karyotype_analysis:
62+
raise NotImplementedError(
63+
"Inversion karyotype analysis is not available for this data resource."
64+
)
4865

4966
@_check_types
5067
@doc(
5168
summary="Load tag SNPs for a given inversion.",
5269
)
5370
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
54-
# needs to be modified depending on where we are hosting
55-
import importlib.resources
56-
from .. import resources
71+
self._require_karyotype_analysis()
5772

58-
if self._inversion_tag_path is None:
59-
raise NotImplementedError(
60-
"No inversion tags are available for this data resource."
73+
path = (
74+
f"{self._base_path}/{self._major_version_path}"
75+
f"/karyotype/{self._karyotype_analysis}/karyotype_tag_snps.csv"
76+
)
77+
with self._fs.open(path) as f:
78+
df_tag_snps = pd.read_csv(f, sep=",")
79+
80+
# Validate inversion name.
81+
available = sorted(df_tag_snps["inversion"].unique())
82+
if inversion not in available:
83+
raise ValueError(
84+
f"Unknown inversion '{inversion}'. Available inversions: {available}"
6185
)
62-
else:
63-
with importlib.resources.path(resources, self._inversion_tag_path) as path:
64-
df_tag_snps = pd.read_csv(path, sep=",")
65-
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
86+
87+
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index(drop=True)
6688

6789
@_check_types
6890
@doc(
69-
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
91+
summary="Infer karyotype from tag SNPs for a given inversion.",
7092
)
7193
def karyotype(
7294
self,
@@ -79,7 +101,7 @@ def karyotype(
79101
df_tagsnps = self.load_inversion_tags(inversion=inversion)
80102
inversion_pos = df_tagsnps["position"]
81103
inversion_alts = df_tagsnps["alt_allele"]
82-
contig = inversion[0:2]
104+
contig = df_tagsnps["contig"].iloc[0]
83105

84106
# get snp calls for inversion region
85107
start, end = np.min(inversion_pos), np.max(inversion_pos)

malariagen_data/anopheles.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def __init__(
140140
aim_species_colors: Optional[Mapping[str, str]] = None,
141141
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
142142
gene_names: Optional[Mapping[str, str]] = None,
143-
inversion_tag_path: Optional[str] = None,
143+
karyotype_analysis: Optional[str] = None,
144144
unrestricted_use_only: Optional[bool] = None,
145145
surveillance_use_only: Optional[bool] = None,
146146
):
@@ -178,7 +178,7 @@ def __init__(
178178
aim_species_colors=aim_species_colors,
179179
virtual_contigs=virtual_contigs,
180180
gene_names=gene_names,
181-
inversion_tag_path=inversion_tag_path,
181+
karyotype_analysis=karyotype_analysis,
182182
unrestricted_use_only=unrestricted_use_only,
183183
surveillance_use_only=surveillance_use_only,
184184
)

0 commit comments

Comments
 (0)