33from typing import Callable , Optional , List , Any
44import warnings
55import fsspec
6+ from numpydoc_decorator import doc # type: ignore
7+
8+ from ..util import check_types
69from malariagen_data .anoph import base_params , phenotype_params
710
811
@@ -308,6 +311,13 @@ def _create_phenotype_dataset(
308311
309312 return ds
310313
314+ @check_types
315+ @doc (
316+ summary = "Load phenotypic data from insecticide resistance bioassays." ,
317+ returns = dict (
318+ df = "DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
319+ ),
320+ )
311321 def phenotype_data (
312322 self ,
313323 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -318,45 +328,9 @@ def phenotype_data(
318328 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
319329 ) -> pd .DataFrame :
320330 """
321- Load phenotypic data from insecticide resistance bioassays.
322-
323- This function retrieves phenotypic data from bioassay experiments and merges it
324- with sample metadata to provide a comprehensive dataset for analysis. The data
325- includes information about insecticide resistance testing, mortality rates,
326- and associated sample characteristics.
327-
328- Parameters
329- ----------
330- sample_sets : base_params.sample_sets, optional
331- Sample sets to load data for. If None, loads data from all available sample sets.
332- sample_query : base_params.sample_query, optional
333- Query string to filter samples using pandas query syntax. Can include
334- phenotype-specific columns such as:
335- - insecticide: e.g., "insecticide == 'Deltamethrin'"
336- - dose: e.g., "dose in [0.5, 2.0]"
337- - phenotype: e.g., "phenotype == 'alive'"
338- - location: e.g., "location == 'Cotonou'"
339- - Any other metadata columns available in the dataset
340- sample_query_options : base_params.sample_query_options, optional
341- Additional options for the sample query, passed to pandas.DataFrame.query().
342- cohort_size : base_params.cohort_size, optional
343- Exact number of samples to include. If specified, samples will be randomly
344- selected to match this size.
345- min_cohort_size : base_params.min_cohort_size, optional
346- Minimum number of samples required. Function will return empty DataFrame
347- if fewer samples are available.
348- max_cohort_size : base_params.max_cohort_size, optional
349- Maximum number of samples to include. If more samples are available,
350- they will be randomly sampled to this size.
351-
352- Returns
353- -------
354- pd.DataFrame
355- DataFrame containing phenotype data merged with sample metadata. Includes
356- columns for sample identifiers, phenotypic measurements, experimental
357- conditions (insecticide, dose), and sample metadata (location, collection
358- date, etc.).
331+ Retrieve and merge phenotype data with sample metadata for bioassay analysis.
359332 """
333+
360334 # 1. Normalize sample_sets
361335 sample_sets_norm = self ._prep_sample_sets_param (sample_sets = sample_sets )
362336
@@ -406,6 +380,13 @@ def phenotype_data(
406380
407381 return df_final
408382
383+ @check_types
384+ @doc (
385+ summary = "Combine phenotypic traits with SNP genotype data for GWAS analysis." ,
386+ returns = dict (
387+ ds = "xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
388+ ),
389+ )
409390 def phenotypes_with_snps (
410391 self ,
411392 region : base_params .region ,
@@ -416,38 +397,8 @@ def phenotypes_with_snps(
416397 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
417398 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
418399 ) -> xr .Dataset :
419- """
420- Load phenotypic data and merge with SNP calls for genetic association analysis.
421-
422- This function combines phenotypic traits with SNP genotype data, enabling
423- genome-wide association studies (GWAS) and other genetic analyses. It first
424- retrieves phenotype data based on the provided filters, then fetches SNP calls
425- for the same samples within the specified genomic region.
426-
427- Parameters
428- ----------
429- region : base_params.region
430- Genomic region specification (e.g., chromosome, start/end positions).
431- sample_sets : base_params.sample_sets, optional
432- Specific sample sets to include in the analysis.
433- sample_query : base_params.sample_query, optional
434- Query string to filter samples (e.g., "population == 'CEU'").
435- sample_query_options : base_params.sample_query_options, optional
436- Additional query options for sample filtering.
437- cohort_size : base_params.cohort_size, optional
438- Exact number of samples to include.
439- min_cohort_size : base_params.min_cohort_size, optional
440- Minimum number of samples required.
441- max_cohort_size : base_params.max_cohort_size, optional
442- Maximum number of samples to include.
443-
444- Returns
445- -------
446- xr.Dataset
447- A xarray Dataset containing phenotype data indexed by sample_id and
448- SNP genotype calls for the specified genomic region, merged for
449- direct phenotype-genotype analysis.
450- """
400+ """Merge phenotypes with SNP calls in a given region for association testing."""
401+
451402 df_phenotypes = self .phenotype_data (
452403 sample_sets = sample_sets ,
453404 sample_query = sample_query ,
@@ -474,6 +425,13 @@ def phenotypes_with_snps(
474425
475426 return ds
476427
428+ @check_types
429+ @doc (
430+ summary = "Combine phenotypic traits with haplotype data for extended association analysis." ,
431+ returns = dict (
432+ ds = "xarray Dataset with phenotype and haplotype data for the specified region."
433+ ),
434+ )
477435 def phenotypes_with_haplotypes (
478436 self ,
479437 region : base_params .region ,
@@ -484,38 +442,8 @@ def phenotypes_with_haplotypes(
484442 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
485443 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
486444 ) -> xr .Dataset :
487- """
488- Load phenotypic data and merge with haplotype data for extended genetic analysis.
489-
490- This function combines phenotypic traits with haplotype data, enabling analysis
491- of linked genetic variants and their association with phenotypes. Haplotypes
492- represent combinations of alleles at multiple nearby loci that are inherited
493- together.
494-
495- Parameters
496- ----------
497- region : base_params.region
498- Genomic region specification for haplotype analysis.
499- sample_sets : base_params.sample_sets, optional
500- Specific sample sets to include in the analysis.
501- sample_query : base_params.sample_query, optional
502- Query string to filter samples.
503- sample_query_options : base_params.sample_query_options, optional
504- Additional query options for sample filtering.
505- cohort_size : base_params.cohort_size, optional
506- Exact number of samples to include.
507- min_cohort_size : base_params.min_cohort_size, optional
508- Minimum number of samples required.
509- max_cohort_size : base_params.max_cohort_size, optional
510- Maximum number of samples to include.
511-
512- Returns
513- -------
514- xr.Dataset
515- A xarray Dataset containing phenotype data indexed by sample_id and
516- haplotype data for the specified genomic region, merged for
517- haplotype-phenotype association analysis.
518- """
445+ """Merge phenotypes with haplotype data in a given region for association testing."""
446+
519447 df_phenotypes = self .phenotype_data (
520448 sample_sets = sample_sets ,
521449 sample_query = sample_query ,
@@ -542,19 +470,14 @@ def phenotypes_with_haplotypes(
542470
543471 return ds
544472
473+ @check_types
474+ @doc (
475+ summary = "List sample sets that contain phenotypic data." ,
476+ returns = dict (sample_sets = "List of sample set identifiers with phenotype data." ),
477+ )
545478 def phenotype_sample_sets (self ) -> List [str ]:
546- """
547- Get list of sample sets that have phenotypic data available.
479+ """Identify sample sets containing phenotype data."""
548480
549- This function scans the available data repository to identify which sample sets
550- contain phenotypic information, helping users determine which datasets can be
551- used for phenotype-based analyses.
552-
553- Returns
554- -------
555- List[str]
556- List of sample set identifiers that have associated phenotype data available.
557- """
558481 all_sample_sets = self .sample_sets ()["sample_set" ].tolist () # type: ignore[operator]
559482 phenotype_sample_sets = []
560483 base_phenotype_path = f"{ self ._url } v3.2/phenotypes/all"
@@ -569,6 +492,12 @@ def phenotype_sample_sets(self) -> List[str]:
569492
570493 return phenotype_sample_sets
571494
495+ @doc (
496+ summary = "Convert phenotype data into binary format for statistical analysis." ,
497+ returns = dict (
498+ binary = "Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
499+ ),
500+ )
572501 def phenotype_binary (
573502 self ,
574503 sample_sets : Optional [base_params .sample_sets ] = None ,
@@ -583,43 +512,8 @@ def phenotype_binary(
583512 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
584513 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
585514 ) -> pd .Series :
586- """
587- Load phenotypic data as binary outcomes for statistical analysis.
588-
589- This function converts phenotypic measurements into binary classifications
590- suitable for statistical analysis, particularly useful for resistance/susceptibility
591- studies. The binary encoding follows: 1=alive/resistant, 0=dead/susceptible,
592- NaN=unknown/missing.
593-
594- Parameters
595- ----------
596- sample_sets : base_params.sample_sets, optional
597- Specific sample sets to include in the analysis.
598- insecticide : phenotype_params.insecticide, optional
599- Insecticide type(s) to filter by. Can be a single value or list.
600- dose : phenotype_params.dose, optional
601- Dose level(s) to filter by. Can be a single value or list.
602- phenotype : phenotype_params.phenotype, optional
603- Specific phenotype(s) to filter by. Can be a single value or list.
604- sample_query : base_params.sample_query, optional
605- Additional query string for sample filtering.
606- sample_query_options : base_params.sample_query_options, optional
607- Additional query options for sample filtering.
608- cohort_size : base_params.cohort_size, optional
609- Exact number of samples to include.
610- min_cohort_size : base_params.min_cohort_size, optional
611- Minimum number of samples required.
612- max_cohort_size : base_params.max_cohort_size, optional
613- Maximum number of samples to include.
614-
615- Returns
616- -------
617- pd.Series
618- A pandas Series indexed by sample_id with binary values:
619- - 1: Alive/resistant phenotype
620- - 0: Dead/susceptible phenotype
621- - NaN: Unknown or missing phenotype
622- """
515+ """Generate binary phenotypic labels from raw phenotype data."""
516+
623517 # Build the sample_query string from individual parameters
624518 query_parts = []
625519 if insecticide is not None :
0 commit comments