Merge pull request #1076 from joshitha1808/feat-allow-region-input-snp-allele-frequencies

jonbrenas · web-flow · commit 159b91ebd01c · 2026-03-22T09:40:28.000Z
feat: allow snp_allele_frequencies to accept genomic regions
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
@@ -119,14 +119,14 @@ def snp_effects(
     @_check_types
     @doc(
         summary="""
-            Compute SNP allele frequencies for a gene transcript.
+            Compute SNP allele frequencies for a gene transcript or genomic region.
         """,
         returns="""
             A dataframe of SNP allele frequencies, one row per variant allele. The variant alleles are indexed by
             their contig, their position, the reference allele, the alternate allele and the associated amino acid change.
             The columns are split into three categories: there is one column for each taxon filter (e.g., pass_funestus, pass_gamb_colu, ...) containing whether the site of the variant allele passes the filter;
-            there is then 1 column for each cohort containing the frequency of the variant allele within the cohort, additionally there is a column `max_af` containing the maximum allele frequency of the variant allele across all cohorts;
-            finally, there are 9 columns describing the variant allele: `transcript` contains the gene transcript used for this analysis,
+            there is then 1 column for each cohort containing the frequency of the variant allele within the cohort, additionally there is a column `max_af` containing the maximum allele frequency of the variant allele accross all cohorts;
+            finally, there are 9 columns describing the variant allele: `transcript` contains the gene transcript used for this analysis (when provided),
             `effect` is the effect of the allele change,
             `impact`is the impact of the allele change,
             `ref_codon` is the reference codon,
@@ -143,8 +143,9 @@ def snp_effects(
     )
     def snp_allele_frequencies(
         self,
-        transcript: base_params.transcript,
-        cohorts: base_params.cohorts,
+        transcript: Optional[base_params.transcript] = None,
+        region: Optional[base_params.region] = None,
+        cohorts: Optional[base_params.cohorts] = None,
         sample_query: Optional[base_params.sample_query] = None,
         sample_query_options: Optional[base_params.sample_query_options] = None,
         min_cohort_size: base_params.min_cohort_size = 10,
@@ -156,6 +157,16 @@ def snp_allele_frequencies(
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
     ) -> pd.DataFrame:
+        # Validate transcript/region usage.
+        if transcript is None and region is None:
+            raise ValueError("Provide either transcript or region.")
+        if transcript is not None and region is not None:
+            raise ValueError("Provide only one of transcript or region, not both.")
+
+        # For backwards compatibility, default region to transcript when only transcript is given.
+        if region is None:
+            region = transcript
+
         # Access sample metadata.
         df_samples = self.sample_metadata(
             sample_sets=sample_sets,
@@ -170,7 +181,7 @@ def snp_allele_frequencies(
 
         # Access SNP data.
         ds_snp = self.snp_calls(
-            region=transcript,
+            region=region,
             site_mask=site_mask,
             sample_sets=sample_sets,
             sample_query=sample_query,
@@ -244,45 +255,49 @@ def snp_allele_frequencies(
         # Reset index after filtering.
         df_snps.reset_index(inplace=True, drop=True)
 
-        if effects:
-            # Add effect annotations.
+        if effects and (transcript is not None):
+            # Add effect annotations (requires transcript).
             ann = self._snp_effect_annotator
             ann.get_effects(
                 transcript=transcript, variants=df_snps, progress=self._progress
             )
 
-            # Add label.
+            # Add label with amino acid change.
             df_snps["label"] = _pandas_apply(
                 _make_snp_label_effect,
                 df_snps,
                 columns=["contig", "position", "ref_allele", "alt_allele", "aa_change"],
             )
 
-            # Set index.
+            # Set index including aa_change.
             df_snps.set_index(
                 ["contig", "position", "ref_allele", "alt_allele", "aa_change"],
                 inplace=True,
             )
 
         else:
-            # Add label.
+            # No transcript (or effects=False): do not require effect annotation.
             df_snps["label"] = _pandas_apply(
                 _make_snp_label,
                 df_snps,
                 columns=["contig", "position", "ref_allele", "alt_allele"],
             )
 
-            # Set index.
             df_snps.set_index(
                 ["contig", "position", "ref_allele", "alt_allele"],
                 inplace=True,
             )
 
         # Add dataframe metadata.
-        gene_name = self._transcript_to_parent_name(transcript)
-        title = transcript
-        if gene_name:
-            title += f" ({gene_name})"
+        if transcript is not None:
+            gene_name = self._transcript_to_parent_name(transcript)
+            title = transcript
+            if gene_name:
+                title += f" ({gene_name})"
+        else:
+            # No transcript, just use the region string.
+            title = str(region)
+
         title += " SNP frequencies"
         df_snps.attrs["title"] = title