Skip to content

Commit 64ff049

Browse files
authored
Merge pull request #1016 from Sharon-codes/issue-435-cohort-group-metadata
Issue 435 cohort group metadata
2 parents ea9e24c + aa40d8a commit 64ff049

7 files changed

Lines changed: 910 additions & 2 deletions

File tree

malariagen_data/anoph/base_params.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@
8181
""",
8282
]
8383

84+
# Parameter type for an optional row filter applied to a cohort-group
# metadata table (used as the ``cohort_group_query`` argument of
# ``cohort_group_metadata``).
# NOTE(review): the string inside ``Annotated`` is presumably picked up as
# the parameter description by the ``@doc`` decorator -- confirm.
cohort_group_query: TypeAlias = Annotated[
    str,
    """
    A pandas query string to be evaluated against cohort-group metadata.
    """,
]
90+
8491
sample_indices: TypeAlias = Annotated[
8592
List[int],
8693
"""
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from typing import Optional
2+
3+
import pandas as pd
4+
from numpydoc_decorator import doc
5+
6+
from ..util import _check_types
7+
from . import base_params
8+
from .base import AnophelesBase
9+
10+
11+
class AnophelesCohortGroupMetadata(AnophelesBase):
    """Adds the ``cohort_group_metadata`` accessor on top of ``AnophelesBase``.

    Cohort-group metadata tables are CSV files, one per cohort group, read
    via ``self.open_file`` from a path built out of the major version path
    and the ``DEFAULT_COHORTS_ANALYSIS`` entry of the release configuration.
    """

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @_check_types
    @doc(
        summary="""
            Read metadata for a specific cohort group, including cohort size,
            country code, taxon, administrative units name, ISO code, geoBoundaries
            shape ID and representative latitude and longitude points.
        """,
        parameters=dict(
            cohort_group="""
                A cohort group name. Accepted values are:
                "admin1_month", "admin1_quarter", "admin1_year",
                "admin2_month", "admin2_quarter", "admin2_year".
            """
        ),
        returns="A dataframe of cohort metadata, one row per cohort.",
    )
    def cohort_group_metadata(
        self,
        cohort_group: base_params.cohorts,
        cohort_group_query: Optional[base_params.cohort_group_query] = None,
    ) -> pd.DataFrame:
        major_version_path = self._major_version_path
        cohorts_analysis = self.config.get("DEFAULT_COHORTS_ANALYSIS")

        # Fail early with a clear message rather than letting ``None`` be
        # interpolated into the path (".../cohorts_None/..."), which would
        # only surface later as a confusing file-not-found error.
        if cohorts_analysis is None:
            raise ValueError(
                "DEFAULT_COHORTS_ANALYSIS is not set in the release "
                "configuration; cannot locate cohort-group metadata."
            )

        # Only the first two characters of the major version path are used
        # as the prefix of the cohorts directory.
        path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_group}.csv"

        # Read the cohort-group metadata table into a pandas dataframe.
        # Only empty strings are added to the set of missing-value markers.
        with self.open_file(path) as f:
            df_cohorts = pd.read_csv(f, sep=",", na_values="")

        # Ensure all column names are lower case.
        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]

        # Apply a cohort group selection.
        if cohort_group_query is not None:
            # Assume a pandas query string.
            df_cohorts = df_cohorts.query(cohort_group_query)
            # Renumber rows so the filtered result has a contiguous index.
            df_cohorts = df_cohorts.reset_index(drop=True)

        return df_cohorts.copy()
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Cohort group metadata quick demo\n",
8+
"\n",
9+
"This notebook shows how to load cohort-group metadata tables and filter them with `cohort_group_query`."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"from malariagen_data import ag3 as _ag3\n",
19+
"from malariagen_data.anoph.cohort_group_metadata import AnophelesCohortGroupMetadata"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"cohort_meta = AnophelesCohortGroupMetadata(\n",
29+
" url=\"simplecache::gs://vo_agam_release_master_us_central1\",\n",
30+
" public_url=\"gs://vo_agam_release_master_us_central1/\",\n",
31+
" config_path=_ag3.CONFIG_PATH,\n",
32+
" major_version_number=_ag3.MAJOR_VERSION_NUMBER,\n",
33+
" major_version_path=_ag3.MAJOR_VERSION_PATH,\n",
34+
" pre=True,\n",
35+
" simplecache={\"cache_storage\": \"../gcs_cache\"},\n",
36+
")\n",
37+
"cohort_meta"
38+
]
39+
},
40+
{
41+
"cell_type": "code",
42+
"execution_count": null,
43+
"metadata": {},
44+
"outputs": [],
45+
"source": [
46+
"df_admin1_month = cohort_meta.cohort_group_metadata(\"admin1_month\")\n",
47+
"df_admin1_month.head()"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"df_bf = cohort_meta.cohort_group_metadata(\n",
57+
" \"admin1_month\",\n",
58+
" cohort_group_query=\"country == 'Burkina Faso' and year >= 2010\",\n",
59+
")\n",
60+
"df_bf.loc[:, [\"cohort_id\", \"cohort_size\", \"country\", \"year\", \"month\", \"admin1_name\"]].head()"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": null,
66+
"metadata": {},
67+
"outputs": [],
68+
"source": [
69+
"df_bf.groupby([\"year\", \"month\"], as_index=False)[\"cohort_size\"].sum().head()"
70+
]
71+
}
72+
],
73+
"metadata": {
74+
"kernelspec": {
75+
"display_name": "Python 3 (ipykernel)",
76+
"language": "python",
77+
"name": "python3"
78+
},
79+
"language_info": {
80+
"codemirror_mode": {
81+
"name": "ipython",
82+
"version": 3
83+
},
84+
"file_extension": ".py",
85+
"mimetype": "text/x-python",
86+
"name": "python",
87+
"nbconvert_exporter": "python",
88+
"pygments_lexer": "ipython3",
89+
"version": "3.11.0"
90+
}
91+
},
92+
"nbformat": 4,
93+
"nbformat_minor": 5
94+
}

0 commit comments

Comments
 (0)