Skip to content

Commit 159b91e

Browse files
authored
Merge pull request #1076 from joshitha1808/feat-allow-region-input-snp-allele-frequencies
feat: allow snp_allele_frequencies to accept genomic regions
2 parents 51acce9 + ec098be commit 159b91e

1 file changed

Lines changed: 31 additions & 16 deletions

File tree

malariagen_data/anoph/snp_frq.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,14 @@ def snp_effects(
119119
@_check_types
120120
@doc(
121121
summary="""
122-
Compute SNP allele frequencies for a gene transcript.
122+
Compute SNP allele frequencies for a gene transcript or genomic region.
123123
""",
124124
returns="""
125125
A dataframe of SNP allele frequencies, one row per variant allele. The variant alleles are indexed by
126126
their contig, their position, the reference allele, the alternate allele and the associated amino acid change.
127127
The columns are split into three categories: there is one column for each taxon filter (e.g., pass_funestus, pass_gamb_colu, ...) containing whether the site of the variant allele passes the filter;
128-
there is then 1 column for each cohort containing the frequency of the variant allele within the cohort, additionally there is a column `max_af` containing the maximum allele frequency of the variant allele across all cohorts;
129-
finally, there are 9 columns describing the variant allele: `transcript` contains the gene transcript used for this analysis,
128+
there is then 1 column for each cohort containing the frequency of the variant allele within the cohort, additionally there is a column `max_af` containing the maximum allele frequency of the variant allele accross all cohorts;
129+
finally, there are 9 columns describing the variant allele: `transcript` contains the gene transcript used for this analysis (when provided),
130130
`effect` is the effect of the allele change,
131131
`impact`is the impact of the allele change,
132132
`ref_codon` is the reference codon,
@@ -143,8 +143,9 @@ def snp_effects(
143143
)
144144
def snp_allele_frequencies(
145145
self,
146-
transcript: base_params.transcript,
147-
cohorts: base_params.cohorts,
146+
transcript: Optional[base_params.transcript] = None,
147+
region: Optional[base_params.region] = None,
148+
cohorts: Optional[base_params.cohorts] = None,
148149
sample_query: Optional[base_params.sample_query] = None,
149150
sample_query_options: Optional[base_params.sample_query_options] = None,
150151
min_cohort_size: base_params.min_cohort_size = 10,
@@ -156,6 +157,16 @@ def snp_allele_frequencies(
156157
chunks: base_params.chunks = base_params.native_chunks,
157158
inline_array: base_params.inline_array = base_params.inline_array_default,
158159
) -> pd.DataFrame:
160+
# Validate transcript/region usage.
161+
if transcript is None and region is None:
162+
raise ValueError("Provide either transcript or region.")
163+
if transcript is not None and region is not None:
164+
raise ValueError("Provide only one of transcript or region, not both.")
165+
166+
# For backwards compatibility, default region to transcript when only transcript is given.
167+
if region is None:
168+
region = transcript
169+
159170
# Access sample metadata.
160171
df_samples = self.sample_metadata(
161172
sample_sets=sample_sets,
@@ -170,7 +181,7 @@ def snp_allele_frequencies(
170181

171182
# Access SNP data.
172183
ds_snp = self.snp_calls(
173-
region=transcript,
184+
region=region,
174185
site_mask=site_mask,
175186
sample_sets=sample_sets,
176187
sample_query=sample_query,
@@ -244,45 +255,49 @@ def snp_allele_frequencies(
244255
# Reset index after filtering.
245256
df_snps.reset_index(inplace=True, drop=True)
246257

247-
if effects:
248-
# Add effect annotations.
258+
if effects and (transcript is not None):
259+
# Add effect annotations (requires transcript).
249260
ann = self._snp_effect_annotator
250261
ann.get_effects(
251262
transcript=transcript, variants=df_snps, progress=self._progress
252263
)
253264

254-
# Add label.
265+
# Add label with amino acid change.
255266
df_snps["label"] = _pandas_apply(
256267
_make_snp_label_effect,
257268
df_snps,
258269
columns=["contig", "position", "ref_allele", "alt_allele", "aa_change"],
259270
)
260271

261-
# Set index.
272+
# Set index including aa_change.
262273
df_snps.set_index(
263274
["contig", "position", "ref_allele", "alt_allele", "aa_change"],
264275
inplace=True,
265276
)
266277

267278
else:
268-
# Add label.
279+
# No transcript (or effects=False): do not require effect annotation.
269280
df_snps["label"] = _pandas_apply(
270281
_make_snp_label,
271282
df_snps,
272283
columns=["contig", "position", "ref_allele", "alt_allele"],
273284
)
274285

275-
# Set index.
276286
df_snps.set_index(
277287
["contig", "position", "ref_allele", "alt_allele"],
278288
inplace=True,
279289
)
280290

281291
# Add dataframe metadata.
282-
gene_name = self._transcript_to_parent_name(transcript)
283-
title = transcript
284-
if gene_name:
285-
title += f" ({gene_name})"
292+
if transcript is not None:
293+
gene_name = self._transcript_to_parent_name(transcript)
294+
title = transcript
295+
if gene_name:
296+
title += f" ({gene_name})"
297+
else:
298+
# No transcript, just use the region string.
299+
title = str(region)
300+
286301
title += " SNP frequencies"
287302
df_snps.attrs["title"] = title
288303

0 commit comments

Comments
 (0)