Skip to content

Commit dad61a6

Browse files
committed
Merge branch 'master' into GH716_add_constructor_params
2 parents 0f20d3a + c659b69 commit dad61a6

11 files changed

Lines changed: 3268 additions & 3183 deletions

File tree

docs/source/Ag3.rst

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -224,3 +224,14 @@ Inversion karyotypes
224224
:toctree: generated/
225225

226226
karyotype
227+
228+
Phenotype data access
229+
---------------------
230+
.. autosummary::
231+
:toctree: generated/
232+
233+
phenotype_data
234+
phenotypes_with_snps
235+
phenotypes_with_haplotypes
236+
phenotype_sample_sets
237+
phenotype_binary

docs/source/index.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ natural genetic variation.
8282

8383
Some data from MalariaGEN are subject to **terms of use** which may include an embargo on
8484
public communication of any analysis results without permission from data owners. If you
85-
have any questions about terms of use please email data@malariagen.net.
85+
have any questions about terms of use please email support@malariagen.net.
8686

8787
By default, this software package accesses data directly from the **MalariaGEN cloud data repository**
8888
hosted in Google Cloud Storage in the US. Note that data access will be more efficient if your

malariagen_data/anoph/phenotypes.py

Lines changed: 70 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,11 @@
11
import pandas as pd
22
import xarray as xr
3-
from typing import Callable, Optional, List, Any
3+
from typing import Callable, Optional, List, Any, TYPE_CHECKING
44
import warnings
55
import fsspec
6+
from numpydoc_decorator import doc # type: ignore
7+
8+
from ..util import check_types
69
from malariagen_data.anoph import base_params, phenotype_params
710

811

@@ -12,14 +15,20 @@ class AnophelesPhenotypeData:
1215
Inherited by AnophelesDataResource subclasses (e.g., Ag3).
1316
"""
1417

15-
# Type annotations for MyPy
16-
_url: str
17-
_fs: fsspec.AbstractFileSystem
18-
sample_metadata: Callable[..., pd.DataFrame]
19-
sample_sets: list[str]
20-
_prep_sample_sets_param: Callable[..., Any]
21-
snp_calls: Callable[..., Any]
22-
haplotypes: Callable[..., Any]
18+
if TYPE_CHECKING:
19+
# Type annotations for MyPy
20+
_url: str
21+
_fs: fsspec.AbstractFileSystem
22+
sample_metadata: Callable[..., pd.DataFrame]
23+
_base_path: str
24+
_major_version_path: str
25+
_release_to_path: Callable[[str], str]
26+
lookup_release: Callable[..., str]
27+
_prep_sample_sets_param: Callable[..., Any]
28+
29+
sample_sets: Callable[..., pd.DataFrame]
30+
snp_calls: Callable[..., Any]
31+
haplotypes: Callable[..., Any]
2332

2433
def __init__(self, **kwargs):
2534
super().__init__(**kwargs)
@@ -32,11 +41,14 @@ def _load_phenotype_data(
3241
Load raw phenotypic data from GCS for given sample sets.
3342
"""
3443
phenotype_dfs = []
35-
base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
3644

3745
for sample_set in sample_sets:
38-
phenotype_path = f"{base_phenotype_path}/{sample_set}/phenotypes.csv"
3946
try:
47+
release = self.lookup_release(sample_set=sample_set)
48+
release_path = self._release_to_path(release)
49+
50+
phenotype_path = f"{self._base_path}/{release_path}/phenotypes/all/{sample_set}/phenotypes.csv"
51+
4052
if not self._fs.exists(phenotype_path):
4153
warnings.warn(
4254
f"Phenotype data file not found for {sample_set} at {phenotype_path}"
@@ -58,14 +70,9 @@ def _load_phenotype_data(
5870
df_pheno["sample_set"] = sample_set
5971
phenotype_dfs.append(df_pheno)
6072

61-
except FileNotFoundError:
62-
warnings.warn(
63-
f"Phenotype data file not found for {sample_set} at {phenotype_path}"
64-
)
65-
continue
6673
except Exception as e:
6774
warnings.warn(
68-
f"Unexpected error loading phenotype data for {sample_set} from {phenotype_path}: {e}"
75+
f"Unexpected error loading phenotype data for {sample_set}: {e}"
6976
)
7077
continue
7178

@@ -308,6 +315,13 @@ def _create_phenotype_dataset(
308315

309316
return ds
310317

318+
@check_types
319+
@doc(
320+
summary="Load phenotypic data from insecticide resistance bioassays.",
321+
returns=dict(
322+
df="DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
323+
),
324+
)
311325
def phenotype_data(
312326
self,
313327
sample_sets: Optional[base_params.sample_sets] = None,
@@ -318,55 +332,9 @@ def phenotype_data(
318332
max_cohort_size: Optional[base_params.max_cohort_size] = None,
319333
) -> pd.DataFrame:
320334
"""
321-
Load phenotypic data from insecticide resistance bioassays.
322-
323-
Parameters
324-
----------
325-
sample_sets : Optional[base_params.sample_sets]
326-
Sample sets to load data for.
327-
sample_query : Optional[base_params.sample_query]
328-
Query string to filter samples. Can include phenotype-specific columns like:
329-
- insecticide: e.g., "insecticide == 'Deltamethrin'"
330-
- dose: e.g., "dose in [0.5, 2.0]"
331-
- phenotype: e.g., "phenotype == 'alive'"
332-
- location: e.g., "location == 'Cotonou'"
333-
- Any other metadata columns
334-
sample_query_options : Optional[base_params.sample_query_options]
335-
Options for the sample query.
336-
cohort_size : Optional[base_params.cohort_size]
337-
Exact cohort size for sampling.
338-
min_cohort_size : Optional[base_params.min_cohort_size]
339-
Minimum cohort size to include.
340-
max_cohort_size : Optional[base_params.max_cohort_size]
341-
Maximum cohort size (will be randomly sampled if exceeded).
342-
343-
Returns
344-
-------
345-
pd.DataFrame
346-
DataFrame containing phenotype data merged with sample metadata.
347-
348-
Examples
349-
--------
350-
# Load all phenotype data
351-
df = ag3.phenotype_data(sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'])
352-
353-
# Filter by insecticide
354-
df = ag3.phenotype_data(
355-
sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
356-
sample_query="insecticide == 'Deltamethrin'"
357-
)
358-
359-
# Filter by multiple criteria
360-
df = ag3.phenotype_data(
361-
sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
362-
sample_query="insecticide == 'Deltamethrin' and dose >= 1.0 and phenotype == 'alive'"
363-
)
364-
365-
# Filter by location and insecticide
366-
df = ag3.phenotype_data(
367-
sample_query="location == 'Cotonou' and insecticide in ['Deltamethrin', 'Bendiocarb']"
368-
)
335+
Retrieve and merge phenotype data with sample metadata for bioassay analysis.
369336
"""
337+
370338
# 1. Normalize sample_sets
371339
sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
372340

@@ -416,6 +384,13 @@ def phenotype_data(
416384

417385
return df_final
418386

387+
@check_types
388+
@doc(
389+
summary="Combine phenotypic traits with SNP genotype data for GWAS analysis.",
390+
returns=dict(
391+
ds="xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
392+
),
393+
)
419394
def phenotypes_with_snps(
420395
self,
421396
region: base_params.region,
@@ -426,9 +401,8 @@ def phenotypes_with_snps(
426401
min_cohort_size: Optional[base_params.min_cohort_size] = None,
427402
max_cohort_size: Optional[base_params.max_cohort_size] = None,
428403
) -> xr.Dataset:
429-
"""
430-
Load phenotypic data and merge with SNP calls.
431-
"""
404+
"""Merge phenotypes with SNP calls in a given region for association testing."""
405+
432406
df_phenotypes = self.phenotype_data(
433407
sample_sets=sample_sets,
434408
sample_query=sample_query,
@@ -455,6 +429,13 @@ def phenotypes_with_snps(
455429

456430
return ds
457431

432+
@check_types
433+
@doc(
434+
summary="Combine phenotypic traits with haplotype data for extended association analysis.",
435+
returns=dict(
436+
ds="xarray Dataset with phenotype and haplotype data for the specified region."
437+
),
438+
)
458439
def phenotypes_with_haplotypes(
459440
self,
460441
region: base_params.region,
@@ -465,9 +446,8 @@ def phenotypes_with_haplotypes(
465446
min_cohort_size: Optional[base_params.min_cohort_size] = None,
466447
max_cohort_size: Optional[base_params.max_cohort_size] = None,
467448
) -> xr.Dataset:
468-
"""
469-
Load phenotypic data and merge with haplotype data.
470-
"""
449+
"""Merge phenotypes with haplotype data in a given region for association testing."""
450+
471451
df_phenotypes = self.phenotype_data(
472452
sample_sets=sample_sets,
473453
sample_query=sample_query,
@@ -494,29 +474,37 @@ def phenotypes_with_haplotypes(
494474

495475
return ds
496476

477+
@check_types
478+
@doc(
479+
summary="List sample sets that contain phenotypic data.",
480+
returns=dict(sample_sets="List of sample set identifiers with phenotype data."),
481+
)
497482
def phenotype_sample_sets(self) -> List[str]:
498-
"""
499-
Get list of sample sets that have phenotypic data available.
483+
"""Identify sample sets containing phenotype data."""
500484

501-
Returns
502-
-------
503-
List[str]
504-
List of sample set identifiers with available phenotype data.
505-
"""
506485
all_sample_sets = self.sample_sets()["sample_set"].tolist() # type: ignore[operator]
507486
phenotype_sample_sets = []
508-
base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
509487

510488
for sample_set in all_sample_sets:
511489
try:
512-
phenotype_path = f"{base_phenotype_path}/{sample_set}/phenotypes.csv"
490+
release = self.lookup_release(sample_set=sample_set)
491+
release_path = self._release_to_path(release)
492+
493+
phenotype_path = f"{self._base_path}/{release_path}/phenotypes/all/{sample_set}/phenotypes.csv"
494+
513495
if self._fs.exists(phenotype_path):
514496
phenotype_sample_sets.append(sample_set)
515497
except Exception:
516498
continue
517499

518500
return phenotype_sample_sets
519501

502+
@doc(
503+
summary="Convert phenotype data into binary format for statistical analysis.",
504+
returns=dict(
505+
binary="Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
506+
),
507+
)
520508
def phenotype_binary(
521509
self,
522510
sample_sets: Optional[base_params.sample_sets] = None,
@@ -531,10 +519,8 @@ def phenotype_binary(
531519
min_cohort_size: Optional[base_params.min_cohort_size] = None,
532520
max_cohort_size: Optional[base_params.max_cohort_size] = None,
533521
) -> pd.Series:
534-
"""
535-
Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
536-
Returns a pandas Series indexed by sample_id.
537-
"""
522+
"""Generate binary phenotypic labels from raw phenotype data."""
523+
538524
# Build the sample_query string from individual parameters
539525
query_parts = []
540526
if insecticide is not None:

0 commit comments

Comments
 (0)