Skip to content

Commit ce0ed40

Browse files
Merge branch 'fix/deprecation-warning' of https://github.com/aman-dev-maker/malariagen-data-python into fix/deprecation-warning
# Please enter a commit message to explain why this merge is necessary, # especially if it merges an updated upstream into a topic branch. # # Lines starting with '#' will be ignored, and an empty message aborts # the commit.
2 parents 73c4f88 + b752e20 commit ce0ed40

12 files changed

Lines changed: 477 additions & 45 deletions

File tree

WINDOWS_SETUP.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
# Windows Setup Guide
1+
Windows Setup Guide
22

33
To get set up for development on Windows, see
44
[this video if you prefer VS Code](https://youtu.be/zddl3n1DCFM),
55
or [this older video if you prefer PyCharm](https://youtu.be/QniQi-Hoo9A),
66
and the instructions below.
77

8-
## 1. Fork and clone this repo
8+
1. Fork and clone this repo
99

1010
Go to https://github.com/malariagen/malariagen-data-python
1111
and click the Fork button (top right corner).
@@ -16,7 +16,7 @@ git clone https://github.com/[username]/malariagen-data-python.git
1616
cd malariagen-data-python
1717
```
1818

19-
## 2. Install Python
19+
2. Install Python
2020

2121
Download and install Python 3.10 from:
2222
https://www.python.org/downloads/windows/
@@ -29,7 +29,7 @@ Verify it worked:
2929
python --version
3030
```
3131

32-
## 3. Install pipx and poetry
32+
3. Install pipx and poetry
3333
```bash
3434
python -m pip install --user pipx
3535
python -m pipx ensurepath
@@ -39,19 +39,19 @@ pipx install poetry
3939
> ⚠️ After running ensurepath, close and reopen
4040
> PowerShell before continuing.
4141
42-
## 4. Create and activate development environment
42+
4. Create and activate development environment
4343
```bash
4444
poetry install
4545
poetry shell
4646
```
4747

48-
## 5. Install pre-commit hooks
48+
5. Install pre-commit hooks
4949
```bash
5050
pipx install pre-commit
5151
pre-commit install
5252
```
5353

54-
## 6. Add upstream remote and get latest code
54+
6. Add upstream remote and get latest code
5555
```bash
5656
git remote add upstream https://github.com/malariagen/malariagen-data-python
5757
git pull upstream master
@@ -60,29 +60,29 @@ git pull upstream master
6060
> ℹ️ Note: On Windows the default branch is called
6161
> master not main.
6262
63-
## 7. Verify everything works
63+
7. Verify everything works
6464
```bash
6565
python -c "import malariagen_data; print('Setup successful!')"
6666
```
6767

68-
## Common Issues on Windows
68+
Common Issues on Windows
6969

70-
**poetry not found after install**
70+
poetry not found after install
7171
Close and reopen PowerShell, then try again.
7272

73-
**git not recognized**
73+
git not recognized
7474
Install Git from https://git-scm.com/download/win
7575
and restart PowerShell.
7676

77-
**python not recognized**
77+
python not recognized
7878
Reinstall Python and make sure to check
7979
"Add Python to PATH" during installation.
8080

81-
**fatal: not a git repository**
81+
fatal: not a git repository
8282
Make sure you are inside the malariagen-data-python
8383
folder before running any git commands.
8484
Use: cd malariagen-data-python
8585

86-
**error: pathspec main did not match**
86+
error: pathspec main did not match
8787
This repository's default branch is master, so use master instead of main.
88-
Run: git checkout master
88+
Run: git checkout master

malariagen_data/anoph/sample_metadata.py

Lines changed: 94 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import json
23
from itertools import cycle
34
from typing import (
45
Any,
@@ -81,6 +82,8 @@ def __init__(
8182

8283
# Initialize cache attributes.
8384
self._cache_sample_metadata: Dict = dict()
85+
self._cache_cohorts: Dict = dict()
86+
self._cache_cohort_geometries: Dict = dict()
8487

8588
def _metadata_paths(
8689
self,
@@ -1485,7 +1488,11 @@ def _setup_cohort_queries(
14851488
A cohort set name. Accepted values are:
14861489
"admin1_month", "admin1_quarter", "admin1_year",
14871490
"admin2_month", "admin2_quarter", "admin2_year".
1488-
"""
1491+
""",
1492+
query="""
1493+
An optional pandas query string to filter the resulting
1494+
dataframe, e.g., "country == 'Burkina Faso'".
1495+
""",
14891496
),
14901497
returns="""A dataframe of cohort data, one row per cohort. There are up to 18 columns:
14911498
`cohort_id` is the identifier of the cohort,
@@ -1512,20 +1519,98 @@ def _setup_cohort_queries(
15121519
def cohorts(
15131520
self,
15141521
cohort_set: base_params.cohorts,
1522+
query: Optional[str] = None,
15151523
) -> pd.DataFrame:
1516-
major_version_path = self._major_version_path
1524+
valid_cohort_sets = {
1525+
"admin1_month",
1526+
"admin1_quarter",
1527+
"admin1_year",
1528+
"admin2_month",
1529+
"admin2_quarter",
1530+
"admin2_year",
1531+
}
1532+
if cohort_set not in valid_cohort_sets:
1533+
raise ValueError(
1534+
f"{cohort_set!r} is not a valid cohort set. "
1535+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1536+
)
1537+
1538+
cohorts_analysis = self._cohorts_analysis
1539+
1540+
# Cache to avoid repeated reads.
1541+
cache_key = (cohorts_analysis, cohort_set)
1542+
try:
1543+
df_cohorts = self._cache_cohorts[cache_key]
1544+
except KeyError:
1545+
major_version_path = self._major_version_path
1546+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1547+
1548+
with self.open_file(path) as f:
1549+
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1550+
1551+
# Ensure all column names are lower case.
1552+
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1553+
1554+
self._cache_cohorts[cache_key] = df_cohorts
1555+
1556+
if query is not None:
1557+
df_cohorts = df_cohorts.query(query)
1558+
df_cohorts = df_cohorts.reset_index(drop=True)
1559+
1560+
return df_cohorts.copy()
1561+
1562+
@_check_types
1563+
@doc(
1564+
summary="""
1565+
Read GeoJSON geometry data for a specific cohort set,
1566+
providing boundary geometries for each cohort.
1567+
""",
1568+
parameters=dict(
1569+
cohort_set="""
1570+
A cohort set name. Accepted values are:
1571+
"admin1_month", "admin1_quarter", "admin1_year",
1572+
"admin2_month", "admin2_quarter", "admin2_year".
1573+
""",
1574+
),
1575+
returns="""
1576+
A dict containing the parsed GeoJSON FeatureCollection,
1577+
with boundary geometries for each cohort in the set.
1578+
""",
1579+
)
1580+
def cohort_geometries(
1581+
self,
1582+
cohort_set: base_params.cohorts,
1583+
) -> dict:
1584+
valid_cohort_sets = {
1585+
"admin1_month",
1586+
"admin1_quarter",
1587+
"admin1_year",
1588+
"admin2_month",
1589+
"admin2_quarter",
1590+
"admin2_year",
1591+
}
1592+
if cohort_set not in valid_cohort_sets:
1593+
raise ValueError(
1594+
f"{cohort_set!r} is not a valid cohort set. "
1595+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1596+
)
1597+
15171598
cohorts_analysis = self._cohorts_analysis
15181599

1519-
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1600+
# Cache to avoid repeated reads.
1601+
cache_key = (cohorts_analysis, cohort_set)
1602+
try:
1603+
geojson_data = self._cache_cohort_geometries[cache_key]
1604+
except KeyError:
1605+
major_version_path = self._major_version_path
1606+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.geojson"
15201607

1521-
# Read the manifest into a pandas dataframe.
1522-
with self.open_file(path) as f:
1523-
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1608+
with self.open_file(path) as f:
1609+
geojson_data = json.load(f)
15241610

1525-
# Ensure all column names are lower case.
1526-
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1611+
self._cache_cohort_geometries[cache_key] = geojson_data
15271612

1528-
return df_cohorts
1613+
return geojson_data
15291614

15301615
@_check_types
15311616
@doc(

malariagen_data/util.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -855,9 +855,7 @@ def _value_error(
855855
value,
856856
expectation,
857857
):
858-
message = (
859-
f"Bad value for parameter {name}; expected {expectation}, " f"found {value!r}"
860-
)
858+
message = f"Bad value for parameter {name}; expected {expectation}, found {value!r}"
861859
raise ValueError(message)
862860

863861

@@ -935,6 +933,7 @@ def info(self, msg):
935933
self.flush()
936934

937935
def set_level(self, level):
936+
self._logger.setLevel(level)
938937
if self._handler is not None:
939938
self._handler.setLevel(level)
940939

notebooks/cohort_geometries.ipynb

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Cohort Geometries\n",
8+
"\n",
9+
"Demonstrates the `cohort_geometries()` method for accessing GeoJSON boundary data."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": null,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import malariagen_data"
19+
]
20+
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"## Set up the Ag3 data resource"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"metadata": {},
32+
"outputs": [],
33+
"source": [
34+
"ag3 = malariagen_data.Ag3(\n",
35+
" \"simplecache::gs://vo_agam_release_master_us_central1\",\n",
36+
" simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
37+
")\n",
38+
"ag3"
39+
]
40+
},
41+
{
42+
"cell_type": "markdown",
43+
"metadata": {},
44+
"source": [
45+
"## Access cohort geometries"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": null,
51+
"metadata": {},
52+
"outputs": [],
53+
"source": [
54+
"geojson = ag3.cohort_geometries(cohort_set=\"admin1_year\")"
55+
]
56+
},
57+
{
58+
"cell_type": "markdown",
59+
"metadata": {},
60+
"source": [
61+
"## Inspect the returned data"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {},
68+
"outputs": [],
69+
"source": [
70+
"type(geojson)"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"geojson.keys()"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"metadata": {},
86+
"outputs": [],
87+
"source": [
88+
"len(geojson[\"features\"])"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": null,
94+
"metadata": {},
95+
"outputs": [],
96+
"source": [
97+
"for f in geojson[\"features\"][:3]:\n",
98+
" print(f[\"properties\"])"
99+
]
100+
}
101+
],
102+
"metadata": {
103+
"kernelspec": {
104+
"display_name": "Python 3 (ipykernel)",
105+
"language": "python",
106+
"name": "python3"
107+
},
108+
"language_info": {
109+
"name": "python",
110+
"version": "3.10.0"
111+
}
112+
},
113+
"nbformat": 4,
114+
"nbformat_minor": 4
115+
}

notebooks/plot_haplotypes_frequencies.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
" sample_sets=[\"1232-VO-KE-OCHOMO-VMF00044\"],\n",
103103
" min_cohort_size=10,\n",
104104
")\n",
105-
"ag3.plot_frequencies_time_series(hap_xr)"
105+
"af1.plot_frequencies_time_series(hap_xr)"
106106
]
107107
},
108108
{

0 commit comments

Comments
 (0)