11import pandas as pd
22import xarray as xr
3- from typing import Callable , Optional , List , Any
3+ from typing import Callable , Optional , List , Any , TYPE_CHECKING
44import warnings
55import fsspec
6+ from numpydoc_decorator import doc # type: ignore
7+
8+ from ..util import check_types
69from malariagen_data .anoph import base_params , phenotype_params
710
811
@@ -12,14 +15,20 @@ class AnophelesPhenotypeData:
1215 Inherited by AnophelesDataResource subclasses (e.g., Ag3).
1316 """
1417
15- # Type annotations for MyPy
16- _url : str
17- _fs : fsspec .AbstractFileSystem
18- sample_metadata : Callable [..., pd .DataFrame ]
19- sample_sets : list [str ]
20- _prep_sample_sets_param : Callable [..., Any ]
21- snp_calls : Callable [..., Any ]
22- haplotypes : Callable [..., Any ]
18+ if TYPE_CHECKING :
19+ # Type annotations for MyPy
20+ _url : str
21+ _fs : fsspec .AbstractFileSystem
22+ sample_metadata : Callable [..., pd .DataFrame ]
23+ _base_path : str
24+ _major_version_path : str
25+ _release_to_path : Callable [[str ], str ]
26+ lookup_release : Callable [..., str ]
27+ _prep_sample_sets_param : Callable [..., Any ]
28+
29+ sample_sets : Callable [..., pd .DataFrame ]
30+ snp_calls : Callable [..., Any ]
31+ haplotypes : Callable [..., Any ]
2332
2433 def __init__ (self , ** kwargs ):
2534 super ().__init__ (** kwargs )
@@ -32,11 +41,14 @@ def _load_phenotype_data(
3241 Load raw phenotypic data from GCS for given sample sets.
3342 """
3443 phenotype_dfs = []
35- base_phenotype_path = f"{ self ._url } v3.2/phenotypes/all"
3644
3745 for sample_set in sample_sets :
38- phenotype_path = f"{ base_phenotype_path } /{ sample_set } /phenotypes.csv"
3946 try :
47+ release = self .lookup_release (sample_set = sample_set )
48+ release_path = self ._release_to_path (release )
49+
50+ phenotype_path = f"{ self ._base_path } /{ release_path } /phenotypes/all/{ sample_set } /phenotypes.csv"
51+
4052 if not self ._fs .exists (phenotype_path ):
4153 warnings .warn (
4254 f"Phenotype data file not found for { sample_set } at { phenotype_path } "
@@ -58,14 +70,9 @@ def _load_phenotype_data(
5870 df_pheno ["sample_set" ] = sample_set
5971 phenotype_dfs .append (df_pheno )
6072
61- except FileNotFoundError :
62- warnings .warn (
63- f"Phenotype data file not found for { sample_set } at { phenotype_path } "
64- )
65- continue
6673 except Exception as e :
6774 warnings .warn (
68- f"Unexpected error loading phenotype data for { sample_set } from { phenotype_path } : { e } "
75+ f"Unexpected error loading phenotype data for { sample_set } : { e } "
6976 )
7077 continue
7178
@@ -308,6 +315,13 @@ def _create_phenotype_dataset(
308315
309316 return ds
310317
318+ @check_types
319+ @doc (
320+ summary = "Load phenotypic data from insecticide resistance bioassays." ,
321+ returns = dict (
322+ df = "DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
323+ ),
324+ )
311325 def phenotype_data (
312326 self ,
313327 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -318,55 +332,9 @@ def phenotype_data(
318332 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
319333 ) -> pd .DataFrame :
320334 """
321- Load phenotypic data from insecticide resistance bioassays.
322-
323- Parameters
324- ----------
325- sample_sets : Optional[base_params.sample_sets]
326- Sample sets to load data for.
327- sample_query : Optional[base_params.sample_query]
328- Query string to filter samples. Can include phenotype-specific columns like:
329- - insecticide: e.g., "insecticide == 'Deltamethrin'"
330- - dose: e.g., "dose in [0.5, 2.0]"
331- - phenotype: e.g., "phenotype == 'alive'"
332- - location: e.g., "location == 'Cotonou'"
333- - Any other metadata columns
334- sample_query_options : Optional[base_params.sample_query_options]
335- Options for the sample query.
336- cohort_size : Optional[base_params.cohort_size]
337- Exact cohort size for sampling.
338- min_cohort_size : Optional[base_params.min_cohort_size]
339- Minimum cohort size to include.
340- max_cohort_size : Optional[base_params.max_cohort_size]
341- Maximum cohort size (will be randomly sampled if exceeded).
342-
343- Returns
344- -------
345- pd.DataFrame
346- DataFrame containing phenotype data merged with sample metadata.
347-
348- Examples
349- --------
350- # Load all phenotype data
351- df = ag3.phenotype_data(sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'])
352-
353- # Filter by insecticide
354- df = ag3.phenotype_data(
355- sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
356- sample_query="insecticide == 'Deltamethrin'"
357- )
358-
359- # Filter by multiple criteria
360- df = ag3.phenotype_data(
361- sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
362- sample_query="insecticide == 'Deltamethrin' and dose >= 1.0 and phenotype == 'alive'"
363- )
364-
365- # Filter by location and insecticide
366- df = ag3.phenotype_data(
367- sample_query="location == 'Cotonou' and insecticide in ['Deltamethrin', 'Bendiocarb']"
368- )
335+ Retrieve and merge phenotype data with sample metadata for bioassay analysis.
369336 """
337+
370338 # 1. Normalize sample_sets
371339 sample_sets_norm = self ._prep_sample_sets_param (sample_sets = sample_sets )
372340
@@ -416,6 +384,13 @@ def phenotype_data(
416384
417385 return df_final
418386
387+ @check_types
388+ @doc (
389+ summary = "Combine phenotypic traits with SNP genotype data for GWAS analysis." ,
390+ returns = dict (
391+ ds = "xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
392+ ),
393+ )
419394 def phenotypes_with_snps (
420395 self ,
421396 region : base_params .region ,
@@ -426,9 +401,8 @@ def phenotypes_with_snps(
426401 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
427402 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
428403 ) -> xr .Dataset :
429- """
430- Load phenotypic data and merge with SNP calls.
431- """
404+ """Merge phenotypes with SNP calls in a given region for association testing."""
405+
432406 df_phenotypes = self .phenotype_data (
433407 sample_sets = sample_sets ,
434408 sample_query = sample_query ,
@@ -455,6 +429,13 @@ def phenotypes_with_snps(
455429
456430 return ds
457431
432+ @check_types
433+ @doc (
434+ summary = "Combine phenotypic traits with haplotype data for extended association analysis." ,
435+ returns = dict (
436+ ds = "xarray Dataset with phenotype and haplotype data for the specified region."
437+ ),
438+ )
458439 def phenotypes_with_haplotypes (
459440 self ,
460441 region : base_params .region ,
@@ -465,9 +446,8 @@ def phenotypes_with_haplotypes(
465446 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
466447 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
467448 ) -> xr .Dataset :
468- """
469- Load phenotypic data and merge with haplotype data.
470- """
449+ """Merge phenotypes with haplotype data in a given region for association testing."""
450+
471451 df_phenotypes = self .phenotype_data (
472452 sample_sets = sample_sets ,
473453 sample_query = sample_query ,
@@ -494,29 +474,37 @@ def phenotypes_with_haplotypes(
494474
495475 return ds
496476
477+ @check_types
478+ @doc (
479+ summary = "List sample sets that contain phenotypic data." ,
480+ returns = dict (sample_sets = "List of sample set identifiers with phenotype data." ),
481+ )
497482 def phenotype_sample_sets (self ) -> List [str ]:
498- """
499- Get list of sample sets that have phenotypic data available.
483+ """Identify sample sets containing phenotype data."""
500484
501- Returns
502- -------
503- List[str]
504- List of sample set identifiers with available phenotype data.
505- """
506485 all_sample_sets = self .sample_sets ()["sample_set" ].tolist () # type: ignore[operator]
507486 phenotype_sample_sets = []
508- base_phenotype_path = f"{ self ._url } v3.2/phenotypes/all"
509487
510488 for sample_set in all_sample_sets :
511489 try :
512- phenotype_path = f"{ base_phenotype_path } /{ sample_set } /phenotypes.csv"
490+ release = self .lookup_release (sample_set = sample_set )
491+ release_path = self ._release_to_path (release )
492+
493+ phenotype_path = f"{ self ._base_path } /{ release_path } /phenotypes/all/{ sample_set } /phenotypes.csv"
494+
513495 if self ._fs .exists (phenotype_path ):
514496 phenotype_sample_sets .append (sample_set )
515497 except Exception :
516498 continue
517499
518500 return phenotype_sample_sets
519501
502+ @doc (
503+ summary = "Convert phenotype data into binary format for statistical analysis." ,
504+ returns = dict (
505+ binary = "Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
506+ ),
507+ )
520508 def phenotype_binary (
521509 self ,
522510 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -531,10 +519,8 @@ def phenotype_binary(
531519 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
532520 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
533521 ) -> pd .Series :
534- """
535- Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
536- Returns a pandas Series indexed by sample_id.
537- """
522+ """Generate binary phenotypic labels from raw phenotype data."""
523+
538524 # Build the sample_query string from individual parameters
539525 query_parts = []
540526 if insecticide is not None :
0 commit comments