Merge branch 'master' into plink-converter-2024-03-26-tristanpwdennis-shadow

jonbrenas · web-flow · commit e07f92cd0ae6 · 2024-11-27T07:24:12.000Z
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:
         )
         # Note: this matches v3, v3. and v3.1, but not v3001.1
         version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")
+        # Sort the releases numerically rather than lexicographically, so that e.g. "3.10" is ordered after "3.9".
+        # The sort key splits each release string on "." and converts every part with int(), e.g. "3.1" -> [3, 1].
+        # NOTE(review): int() raises ValueError on an empty or non-numeric part (e.g. a trailing "." as in "v3.") - confirm release strings are always purely numeric.
         discovered_releases = tuple(
             sorted(
                 [
                     self._path_to_release(d)
                     for d in sub_dirs
                     if version_pattern.match(d)
                     and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")
-                ]
+                ],
+                key=lambda v: [int(part) for part in v.split(".")],
             )
         )
         return discovered_releases
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -888,14 +888,24 @@ def _gene_cnv(
         chunks,
         inline_array,
     ):
-        debug = self._log.debug
-
-        debug("sanity check")
+        # Sanity check.
         assert isinstance(region, Region)
 
-        debug("access HMM data")
+        # Access genes within the region of interest.
+        df_genome_features = self.genome_features(region=region)
+        sample_query_options = sample_query_options or {}
+        df_genes = df_genome_features.query(
+            f"type == '{self._gff_gene_type}'", **sample_query_options
+        )
+
+        # Refine the region for CNV data to ensure coverage of all requested genes.
+        cnv_region = Region(
+            region.contig, df_genes["start"].min(), df_genes["end"].max()
+        )
+
+        # Access HMM data.
         ds_hmm = self.cnv_hmm(
-            region=region.contig,
+            region=cnv_region,
             sample_sets=sample_sets,
             sample_query=sample_query,
             sample_query_options=sample_query_options,
@@ -909,45 +919,38 @@ def _gene_cnv(
         with self._dask_progress(desc="Load CNV HMM data"):
             pos, end, cn = dask.compute(pos, end, cn)
 
-        debug("access genes")
-        df_genome_features = self.genome_features(region=region)
-        sample_query_options = sample_query_options or {}
-        df_genes = df_genome_features.query(
-            f"type == '{self._gff_gene_type}'", **sample_query_options
-        )
-
-        debug("setup intermediates")
+        # Set up intermediates.
         windows = []
         modes = []
         counts = []
 
-        debug("iterate over genes")
+        # Iterate over genes.
         genes_iterator = self._progress(
             df_genes.itertuples(),
             desc="Compute modal gene copy number",
             total=len(df_genes),
         )
         for gene in genes_iterator:
-            # locate windows overlapping the gene
+            # Locate windows overlapping the gene.
             loc_gene_start = bisect_left(end, gene.start)
             loc_gene_stop = bisect_right(pos, gene.end)
             w = loc_gene_stop - loc_gene_start
             windows.append(w)
 
-            # slice out copy number data for the given gene
+            # Slice out copy number data for the given gene.
             cn_gene = cn[loc_gene_start:loc_gene_stop]
 
-            # compute the modes
+            # Compute the modes.
             m, c = _cn_mode(cn_gene, vmax=12)
             modes.append(m)
             counts.append(c)
 
-        debug("combine results")
+        # Combine results.
         windows = np.array(windows)
         modes = np.vstack(modes)
         counts = np.vstack(counts)
 
-        debug("build dataset")
+        # Build dataset.
         ds_out = xr.Dataset(
             coords={
                 "gene_id": (["genes"], df_genes["ID"].values),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
 
                 freq_cols[f"frq_{coh}"] = np.concatenate([amp_freq_coh, del_freq_coh])
 
+        if len(coh_dict) == 0:
+            raise ValueError(
+                "No cohorts available for the given sample selection parameters and minimum cohort size."
+            )
+
         debug("build a dataframe with the frequency columns")
         df_freqs = pd.DataFrame(freq_cols)
 
diff --git a/notebooks/plot_frequencies_heatmap.ipynb b/notebooks/plot_frequencies_heatmap.ipynb
@@ -381,6 +381,44 @@
    "id": "86c5c594",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "interesting_cyp_genes = [\n",
+    "    \"AGAP002862\",  # Cyp6aa1\n",
+    "    \"AGAP013128\",  # Cyp6aa2\n",
+    "    \"AGAP002865\",  # Cyp6p3\n",
+    "    \"AGAP000818\",  # Cyp9k1\n",
+    "    \"AGAP008212\",  # Cyp6m2\n",
+    "    \"AGAP008218\",  # Cyp6z2    \n",
+    "]\n",
+    "\n",
+    "cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n",
+    "    region=interesting_cyp_genes,\n",
+    "    cohorts=\"admin1_year\",\n",
+    "    sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n",
+    "    sample_query=\"taxon == 'coluzzii'\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d7ad130-30c2-4cd3-8906-a7ada3ccc75f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_heatmap(\n",
+    "    df=cyp_cnv_freqs_df,\n",
+    "    color_continuous_scale=\"Blues\",\n",
+    "    title=\"Cyp gene CNV frequencies\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83aab417-632e-4fd2-8da4-3ffdd6e233f6",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],
diff --git a/tests/test_af1.py b/tests/test_af1.py
@@ -6,7 +6,7 @@
 from malariagen_data.util import locate_region, resolve_region
 
 
-def setup_af1(url="simplecache::gs://vo_afun_release/", **kwargs):
+def setup_af1(url="simplecache::gs://vo_afun_release_master_us_central1/", **kwargs):
     kwargs.setdefault("check_location", False)
     kwargs.setdefault("show_progress", False)
     if url is None:
diff --git a/tests/test_ag3.py b/tests/test_ag3.py
@@ -15,7 +15,7 @@
 contigs = "2R", "2L", "3R", "3L", "X"
 
 
-def setup_ag3(url="simplecache::gs://vo_agam_release/", **kwargs):
+def setup_ag3(url="simplecache::gs://vo_agam_release_master_us_central1/", **kwargs):
     kwargs.setdefault("check_location", False)
     kwargs.setdefault("show_progress", False)
     if url is None:

Original file line number	Diff line number	Diff line change
`@@ -327,14 +327,18 @@ def _discover_releases(self) -> Tuple[str, ...]:`
`327`	`327`	`)`
`328`	`328`	`# Note: this matches v3, v3. and v3.1, but not v3001.1`
`329`	`329`	`version_pattern = re.compile(f"^v{self._major_version_number}(\\..*)?$")`
	`330`	`+ # To sort the versions numerically, we use a lambda function for the "key" parameter of sorted().`
	`331`	`+ # The lambda function splits each version string into a list of its integer parts, using split('.') and int(), e.g. [3, 1],`
	`332`	`+ # which sorted() then uses to determine the order, as opposed to the default lexicographic order.`
`330`	`333`	`discovered_releases = tuple(`
`331`	`334`	`sorted(`
`332`	`335`	`[`
`333`	`336`	`self._path_to_release(d)`
`334`	`337`	`for d in sub_dirs`
`335`	`338`	`if version_pattern.match(d)`
`336`	`339`	`and self._fs.exists(f"{self._base_path}/{d}/manifest.tsv")`
`337`		`- ]`
	`340`	`+ ],`
	`341`	`+ key=lambda v: [int(part) for part in v.split(".")],`
`338`	`342`	`)`
`339`	`343`	`)`
`340`	`344`	`return discovered_releases`