33from typing import Callable , Optional , List , Any
44import warnings
55import fsspec
6+ from numpydoc_decorator import doc # type: ignore
7+
8+ from ..util import check_types
69from malariagen_data .anoph import base_params , phenotype_params
710
811
@@ -308,6 +311,13 @@ def _create_phenotype_dataset(
308311
309312 return ds
310313
314+ @check_types
315+ @doc (
316+ summary = "Load phenotypic data from insecticide resistance bioassays." ,
317+ returns = dict (
318+ df = "DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
319+ ),
320+ )
311321 def phenotype_data (
312322 self ,
313323 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -318,55 +328,9 @@ def phenotype_data(
318328 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
319329 ) -> pd .DataFrame :
320330 """
321- Load phenotypic data from insecticide resistance bioassays.
322-
323- Parameters
324- ----------
325- sample_sets : Optional[base_params.sample_sets]
326- Sample sets to load data for.
327- sample_query : Optional[base_params.sample_query]
328- Query string to filter samples. Can include phenotype-specific columns like:
329- - insecticide: e.g., "insecticide == 'Deltamethrin'"
330- - dose: e.g., "dose in [0.5, 2.0]"
331- - phenotype: e.g., "phenotype == 'alive'"
332- - location: e.g., "location == 'Cotonou'"
333- - Any other metadata columns
334- sample_query_options : Optional[base_params.sample_query_options]
335- Options for the sample query.
336- cohort_size : Optional[base_params.cohort_size]
337- Exact cohort size for sampling.
338- min_cohort_size : Optional[base_params.min_cohort_size]
339- Minimum cohort size to include.
340- max_cohort_size : Optional[base_params.max_cohort_size]
341- Maximum cohort size (will be randomly sampled if exceeded).
342-
343- Returns
344- -------
345- pd.DataFrame
346- DataFrame containing phenotype data merged with sample metadata.
347-
348- Examples
349- --------
350- # Load all phenotype data
351- df = ag3.phenotype_data(sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'])
352-
353- # Filter by insecticide
354- df = ag3.phenotype_data(
355- sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
356- sample_query="insecticide == 'Deltamethrin'"
357- )
358-
359- # Filter by multiple criteria
360- df = ag3.phenotype_data(
361- sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
362- sample_query="insecticide == 'Deltamethrin' and dose >= 1.0 and phenotype == 'alive'"
363- )
364-
365- # Filter by location and insecticide
366- df = ag3.phenotype_data(
367- sample_query="location == 'Cotonou' and insecticide in ['Deltamethrin', 'Bendiocarb']"
368- )
331+ Retrieve and merge phenotype data with sample metadata for bioassay analysis.
369332 """
333+
370334 # 1. Normalize sample_sets
371335 sample_sets_norm = self ._prep_sample_sets_param (sample_sets = sample_sets )
372336
@@ -416,6 +380,13 @@ def phenotype_data(
416380
417381 return df_final
418382
383+ @check_types
384+ @doc (
385+ summary = "Combine phenotypic traits with SNP genotype data for GWAS analysis." ,
386+ returns = dict (
387+ ds = "xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
388+ ),
389+ )
419390 def phenotypes_with_snps (
420391 self ,
421392 region : base_params .region ,
@@ -426,9 +397,8 @@ def phenotypes_with_snps(
426397 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
427398 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
428399 ) -> xr .Dataset :
429- """
430- Load phenotypic data and merge with SNP calls.
431- """
400+ """Merge phenotypes with SNP calls in a given region for association testing."""
401+
432402 df_phenotypes = self .phenotype_data (
433403 sample_sets = sample_sets ,
434404 sample_query = sample_query ,
@@ -455,6 +425,13 @@ def phenotypes_with_snps(
455425
456426 return ds
457427
428+ @check_types
429+ @doc (
430+ summary = "Combine phenotypic traits with haplotype data for extended association analysis." ,
431+ returns = dict (
432+ ds = "xarray Dataset with phenotype and haplotype data for the specified region."
433+ ),
434+ )
458435 def phenotypes_with_haplotypes (
459436 self ,
460437 region : base_params .region ,
@@ -465,9 +442,8 @@ def phenotypes_with_haplotypes(
465442 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
466443 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
467444 ) -> xr .Dataset :
468- """
469- Load phenotypic data and merge with haplotype data.
470- """
445+ """Merge phenotypes with haplotype data in a given region for association testing."""
446+
471447 df_phenotypes = self .phenotype_data (
472448 sample_sets = sample_sets ,
473449 sample_query = sample_query ,
@@ -494,15 +470,14 @@ def phenotypes_with_haplotypes(
494470
495471 return ds
496472
473+ @check_types
474+ @doc (
475+ summary = "List sample sets that contain phenotypic data." ,
476+ returns = dict (sample_sets = "List of sample set identifiers with phenotype data." ),
477+ )
497478 def phenotype_sample_sets (self ) -> List [str ]:
498- """
499- Get list of sample sets that have phenotypic data available.
479+ """Identify sample sets containing phenotype data."""
500480
501- Returns
502- -------
503- List[str]
504- List of sample set identifiers with available phenotype data.
505- """
506481 all_sample_sets = self .sample_sets ()["sample_set" ].tolist () # type: ignore[operator]
507482 phenotype_sample_sets = []
508483 base_phenotype_path = f"{ self ._url } v3.2/phenotypes/all"
@@ -517,6 +492,12 @@ def phenotype_sample_sets(self) -> List[str]:
517492
518493 return phenotype_sample_sets
519494
495+ @doc (
496+ summary = "Convert phenotype data into binary format for statistical analysis." ,
497+ returns = dict (
498+ binary = "Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
499+ ),
500+ )
520501 def phenotype_binary (
521502 self ,
522503 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -531,10 +512,8 @@ def phenotype_binary(
531512 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
532513 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
533514 ) -> pd .Series :
534- """
535- Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
536- Returns a pandas Series indexed by sample_id.
537- """
515+ """Generate binary phenotypic labels from raw phenotype data."""
516+
538517 # Build the sample_query string from individual parameters
539518 query_parts = []
540519 if insecticide is not None :
0 commit comments