malariagen
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
Lines changed: 94 additions & 9 deletions
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
Lines changed: 2 additions & 3 deletions
diff --git a/notebooks/cohort_geometries.ipynb b/notebooks/cohort_geometries.ipynb
Lines changed: 115 additions & 0 deletions
diff --git a/notebooks/plot_haplotypes_frequencies.ipynb b/notebooks/plot_haplotypes_frequencies.ipynb
Lines changed: 1 addition & 1 deletion
diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
Lines changed: 54 additions & 22 deletions
@@ -1,4 +1,5 @@
+import copy
 import io
+import json
 from itertools import cycle
 from typing import (
     Any,
@@ -81,6 +82,8 @@ def __init__(
 
         # Initialize cache attributes.
         self._cache_sample_metadata: Dict = dict()
+        self._cache_cohorts: Dict = dict()
+        self._cache_cohort_geometries: Dict = dict()
 
     def _metadata_paths(
         self,
@@ -1496,7 +1499,11 @@ def _setup_cohort_queries(
                 A cohort set name. Accepted values are:
                 "admin1_month", "admin1_quarter", "admin1_year",
                 "admin2_month", "admin2_quarter", "admin2_year".
-            """
+            """,
+            query="""
+                An optional pandas query string to filter the resulting
+                dataframe, e.g., "country == 'Burkina Faso'".
+            """,
         ),
         returns="""A dataframe of cohort data, one row per cohort. There are up to 18 columns:
         `cohort_id` is the identifier of the cohort,
@@ -1523,20 +1530,98 @@ def _setup_cohort_queries(
     def cohorts(
         self,
         cohort_set: base_params.cohorts,
+        query: Optional[str] = None,
     ) -> pd.DataFrame:
-        major_version_path = self._major_version_path
+        # Reject unknown cohort set names before touching the cache or storage.
+        valid_cohort_sets = {
+            "admin1_month",
+            "admin1_quarter",
+            "admin1_year",
+            "admin2_month",
+            "admin2_quarter",
+            "admin2_year",
+        }
+        if cohort_set not in valid_cohort_sets:
+            raise ValueError(
+                f"{cohort_set!r} is not a valid cohort set. "
+                f"Accepted values are: {sorted(valid_cohort_sets)}."
+            )
+
+        cohorts_analysis = self._cohorts_analysis
+        cache_key = (cohorts_analysis, cohort_set)
+
+        # Only read and normalise the CSV on a cache miss.
+        if cache_key not in self._cache_cohorts:
+            major_version_path = self._major_version_path
+            path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
+
+            with self.open_file(path) as f:
+                loaded = pd.read_csv(f, sep=",", na_values="")
+
+            # Ensure all column names are lower case.
+            loaded.columns = [c.lower() for c in loaded.columns]  # type: ignore
+
+            self._cache_cohorts[cache_key] = loaded
+
+        df_cohorts = self._cache_cohorts[cache_key]
+
+        # Without a query, return a copy of the cached frame as-is.
+        if query is None:
+            return df_cohorts.copy()
+
+        # With a query, filter, renumber the rows, and return a copy.
+        return df_cohorts.query(query).reset_index(drop=True).copy()
+
+    @_check_types
+    @doc(
+        summary="""
+            Read GeoJSON geometry data for a specific cohort set,
+            providing boundary geometries for each cohort.
+        """,
+        parameters=dict(
+            cohort_set="""
+                A cohort set name. Accepted values are:
+                "admin1_month", "admin1_quarter", "admin1_year",
+                "admin2_month", "admin2_quarter", "admin2_year".
+            """,
+        ),
+        returns="""
+            A dict containing the parsed GeoJSON FeatureCollection,
+            with boundary geometries for each cohort in the set.
+        """,
+    )
+    def cohort_geometries(
+        self,
+        cohort_set: base_params.cohorts,
+    ) -> dict:
+        # Reject unknown cohort set names before touching the cache or storage.
+        valid_cohort_sets = {
+            "admin1_month",
+            "admin1_quarter",
+            "admin1_year",
+            "admin2_month",
+            "admin2_quarter",
+            "admin2_year",
+        }
+        if cohort_set not in valid_cohort_sets:
+            raise ValueError(
+                f"{cohort_set!r} is not a valid cohort set. "
+                f"Accepted values are: {sorted(valid_cohort_sets)}."
+            )
+
         cohorts_analysis = self._cohorts_analysis
 
-        path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
+        # Cache to avoid repeated reads.
+        cache_key = (cohorts_analysis, cohort_set)
+        try:
+            geojson_data = self._cache_cohort_geometries[cache_key]
+        except KeyError:
+            major_version_path = self._major_version_path
+            path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.geojson"
 
-        # Read the manifest into a pandas dataframe.
-        with self.open_file(path) as f:
-            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+            with self.open_file(path) as f:
+                geojson_data = json.load(f)
 
-        # Ensure all column names are lower case.
-        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]  # type: ignore
+            self._cache_cohort_geometries[cache_key] = geojson_data
 
-        return df_cohorts
+        # Return a deep copy so that callers mutating the (nested) result
+        # cannot corrupt the cached GeoJSON; cohorts() likewise returns a
+        # copy of its cached dataframe.
+        return copy.deepcopy(geojson_data)
 
     @_check_types
     @doc(
 
@@ -855,9 +855,7 @@ def _value_error(
     value,
     expectation,
 ):
-    message = (
-        f"Bad value for parameter {name}; expected {expectation}, " f"found {value!r}"
-    )
+    message = f"Bad value for parameter {name}; expected {expectation}, found {value!r}"
     raise ValueError(message)
 
 
@@ -935,6 +933,7 @@ def info(self, msg):
         self.flush()
 
     def set_level(self, level):
+        # Set the level on the wrapped logger itself.
+        self._logger.setLevel(level)
+        # Also set it on the handler, but only when a handler is present
+        # (self._handler is not None).
         if self._handler is not None:
             self._handler.setLevel(level)
 
 
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cohort Geometries\n",
+    "\n",
+    "Demonstrates the `cohort_geometries()` method for accessing GeoJSON boundary data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import malariagen_data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set up the Ag3 data resource"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3 = malariagen_data.Ag3(\n",
+    "    \"simplecache::gs://vo_agam_release_master_us_central1\",\n",
+    "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
+    ")\n",
+    "ag3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Access cohort geometries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geojson = ag3.cohort_geometries(cohort_set=\"admin1_year\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Inspect the returned data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(geojson)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geojson.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(geojson[\"features\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for f in geojson[\"features\"][:3]:\n",
+    "    print(f[\"properties\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
@@ -102,7 +102,7 @@
     "    sample_sets=[\"1232-VO-KE-OCHOMO-VMF00044\"],\n",
     "    min_cohort_size=10,\n",
     ")\n",
-    "ag3.plot_frequencies_time_series(hap_xr)"
+    "af1.plot_frequencies_time_series(hap_xr)"
    ]
   },
   {
 
@@ -334,7 +334,7 @@ def simulate_exons(
         # keep things simple for now.
         if strand == "-":
             # Take exons in reverse order.
-            exons == exons[::-1]
+            exons = exons[::-1]
         for exon_ix, exon in enumerate(exons):
             first_exon = exon_ix == 0
             last_exon = exon_ix == len(exons) - 1
@@ -646,8 +646,8 @@ def simulate_cnv_hmm(zarr_path, metadata_path, contigs, contig_sizes, rng):
     # - sample_is_high_variance [1D array] [bool] [True or False for n_samples]
     # - samples [1D array] [str]
 
-    # Get a random probability for a sample being high variance, between 0 and 1.
-    p_variance = rng.random()
+    # Keep high variance sample prevalence stable for deterministic tests.
+    p_variance = 0.1
 
     # Open a zarr at the specified path.
     root = zarr.open(zarr_path, mode="w")
@@ -862,8 +862,8 @@ def simulate_cnv_discordant_read_calls(
     # - sample_is_high_variance [1D array] [bool] [True or False for n_samples]
     # - samples [1D array] [str for n_samples]
 
-    # Get a random probability for a sample being high variance, between 0 and 1.
-    p_variance = rng.random()
+    # Keep high variance sample prevalence stable for deterministic tests.
+    p_variance = 0.1
 
     # Get a random probability for choosing allele 1, between 0 and 1.
     p_allele = rng.random()
@@ -1408,23 +1408,55 @@ def write_metadata(
             df_coh_ds.to_csv(dst_path, index=False)
 
             # Create cohorts data by sampling from some real files.
-            src_path = (
-                self.fixture_dir
-                / "vo_agam_release_master_us_central1"
-                / "v3_cohorts"
-                / "cohorts_20230516"
-                / "cohorts_admin1_month.csv"
-            )
-            dst_path = (
-                self.bucket_path
-                / "v3_cohorts"
-                / "cohorts_20230516"
-                / "cohorts_admin1_month.csv"
-            )
-            dst_path.parent.mkdir(parents=True, exist_ok=True)
-            with open(src_path, mode="r") as src, open(dst_path, mode="w") as dst:
-                for line in src.readlines()[:5]:
-                    print(line, file=dst)
+            cohort_files = [
+                "cohorts_admin1_month.csv",
+                "cohorts_admin1_year.csv",
+                "cohorts_admin2_month.csv",
+            ]
+            for cohort_file in cohort_files:
+                src_path = (
+                    self.fixture_dir
+                    / "vo_agam_release_master_us_central1"
+                    / "v3_cohorts"
+                    / "cohorts_20230516"
+                    / cohort_file
+                )
+                if src_path.exists():
+                    dst_path = (
+                        self.bucket_path
+                        / "v3_cohorts"
+                        / "cohorts_20230516"
+                        / cohort_file
+                    )
+                    dst_path.parent.mkdir(parents=True, exist_ok=True)
+                    with open(src_path, mode="r") as src, open(
+                        dst_path, mode="w"
+                    ) as dst:
+                        for line in src.readlines()[:5]:
+                            print(line, file=dst)
+
+            # Copy cohort GeoJSON fixtures.
+            geojson_files = [
+                "cohorts_admin1_month.geojson",
+                "cohorts_admin1_year.geojson",
+            ]
+            for geojson_file in geojson_files:
+                src_path = (
+                    self.fixture_dir
+                    / "vo_agam_release_master_us_central1"
+                    / "v3_cohorts"
+                    / "cohorts_20230516"
+                    / geojson_file
+                )
+                if src_path.exists():
+                    dst_path = (
+                        self.bucket_path
+                        / "v3_cohorts"
+                        / "cohorts_20230516"
+                        / geojson_file
+                    )
+                    dst_path.parent.mkdir(parents=True, exist_ok=True)
+                    shutil.copy2(src_path, dst_path)
 
         # Create data catalog by sampling from some real metadata files.
         src_path = (
Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@`
`102`	`102`	`" sample_sets=[\"1232-VO-KE-OCHOMO-VMF00044\"],\n",`
`103`	`103`	`" min_cohort_size=10,\n",`
`104`	`104`	`")\n",`
`105`		`- "ag3.plot_frequencies_time_series(hap_xr)"`
	`105`	`+ "af1.plot_frequencies_time_series(hap_xr)"`
`106`	`106`	`]`
`107`	`107`	`},`
`108`	`108`	`{`