malariagen
diff --git a/‎malariagen_data/anoph/base_params.py‎
Lines changed: 7 additions & 0 deletions b/‎malariagen_data/anoph/base_params.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎malariagen_data/anoph/cohort_group_metadata.py‎
Lines changed: 60 additions & 0 deletions b/‎malariagen_data/anoph/cohort_group_metadata.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎notebooks/cohort_group_metadata.ipynb‎
Lines changed: 94 additions & 0 deletions b/‎notebooks/cohort_group_metadata.ipynb‎
Lines changed: 94 additions & 0 deletions
@@ -81,6 +81,13 @@
     """,
 ]
 
+# Parameter type for an optional row filter on cohort-group metadata tables,
+# written as a pandas query expression string.
+cohort_group_query: TypeAlias = Annotated[
+    str,
+    """
+    A pandas query string to be evaluated against cohort-group metadata.
+    """,
+]
+
 sample_indices: TypeAlias = Annotated[
     List[int],
     """
 
@@ -0,0 +1,60 @@
+from typing import Optional
+
+import pandas as pd
+from numpydoc_decorator import doc
+
+from ..util import _check_types
+from . import base_params
+from .base import AnophelesBase
+
+
+class AnophelesCohortGroupMetadata(AnophelesBase):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        # N.B., this class is designed to work cooperatively, and
+        # so it's important that any remaining parameters are passed
+        # to the superclass constructor.
+        super().__init__(**kwargs)
+
+    @_check_types
+    @doc(
+        summary="""
+            Read metadata for a specific cohort group, including cohort size,
+            country code, taxon, administrative units name, ISO code, geoBoundaries
+            shape ID and representative latitude and longitude points.
+        """,
+        parameters=dict(
+            cohort_group="""
+                A cohort group name. Accepted values are:
+                "admin1_month", "admin1_quarter", "admin1_year",
+                "admin2_month", "admin2_quarter", "admin2_year".
+            """
+        ),
+        returns="A dataframe of cohort metadata, one row per cohort.",
+    )
+    def cohort_group_metadata(
+        self,
+        cohort_group: base_params.cohorts,
+        cohort_group_query: Optional[base_params.cohort_group_query] = None,
+    ) -> pd.DataFrame:
+        major_version_path = self._major_version_path
+        cohorts_analysis = self.config.get("DEFAULT_COHORTS_ANALYSIS")
+
+        path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_group}.csv"
+
+        # Read the manifest into a pandas dataframe.
+        with self.open_file(path) as f:
+            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+
+        # Ensure all column names are lower case.
+        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]
+
+        # Apply a cohort group selection.
+        if cohort_group_query is not None:
+            # Assume a pandas query string.
+            df_cohorts = df_cohorts.query(cohort_group_query)
+            df_cohorts = df_cohorts.reset_index(drop=True)
+
+        return df_cohorts.copy()
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cohort group metadata quick demo\n",
+    "\n",
+    "This notebook shows how to load cohort-group metadata tables and filter them with `cohort_group_query`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from malariagen_data import ag3 as _ag3\n",
+    "from malariagen_data.anoph.cohort_group_metadata import AnophelesCohortGroupMetadata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cohort_meta = AnophelesCohortGroupMetadata(\n",
+    "    url=\"simplecache::gs://vo_agam_release_master_us_central1\",\n",
+    "    public_url=\"gs://vo_agam_release_master_us_central1/\",\n",
+    "    config_path=_ag3.CONFIG_PATH,\n",
+    "    major_version_number=_ag3.MAJOR_VERSION_NUMBER,\n",
+    "    major_version_path=_ag3.MAJOR_VERSION_PATH,\n",
+    "    pre=True,\n",
+    "    simplecache={\"cache_storage\": \"../gcs_cache\"},\n",
+    ")\n",
+    "cohort_meta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_admin1_month = cohort_meta.cohort_group_metadata(\"admin1_month\")\n",
+    "df_admin1_month.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_bf = cohort_meta.cohort_group_metadata(\n",
+    "    \"admin1_month\",\n",
+    "    cohort_group_query=\"country == 'Burkina Faso' and year >= 2010\",\n",
+    ")\n",
+    "df_bf.loc[:, [\"cohort_id\", \"cohort_size\", \"country\", \"year\", \"month\", \"admin1_name\"]].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_bf.groupby([\"year\", \"month\"], as_index=False)[\"cohort_size\"].sum().head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}