@@ -320,30 +320,42 @@ def phenotype_data(
320320 """
321321 Load phenotypic data from insecticide resistance bioassays.
322322
323+ This function retrieves phenotypic data from bioassay experiments and merges it
324+ with sample metadata to provide a comprehensive dataset for analysis. The data
325+ includes information about insecticide resistance testing, mortality rates,
326+ and associated sample characteristics.
327+
323328 Parameters
324329 ----------
325- sample_sets : Optional[base_params.sample_sets]
326- Sample sets to load data for.
327- sample_query : Optional[base_params.sample_query]
328- Query string to filter samples. Can include phenotype-specific columns like:
330+ sample_sets : base_params.sample_sets, optional
331+ Sample sets to load data for. If None, loads data from all available sample sets.
332+ sample_query : base_params.sample_query, optional
333+ Query string to filter samples using pandas query syntax. Can include
334+ phenotype-specific columns such as:
329335 - insecticide: e.g., "insecticide == 'Deltamethrin'"
330336 - dose: e.g., "dose in [0.5, 2.0]"
331337 - phenotype: e.g., "phenotype == 'alive'"
332338 - location: e.g., "location == 'Cotonou'"
333- - Any other metadata columns
334- sample_query_options : Optional[base_params.sample_query_options]
335- Options for the sample query.
336- cohort_size : Optional[base_params.cohort_size]
337- Exact cohort size for sampling.
338- min_cohort_size : Optional[base_params.min_cohort_size]
339- Minimum cohort size to include.
340- max_cohort_size : Optional[base_params.max_cohort_size]
341- Maximum cohort size (will be randomly sampled if exceeded).
339+ - Any other metadata columns available in the dataset
340+ sample_query_options : base_params.sample_query_options, optional
341+ Additional options for the sample query, passed to pandas.DataFrame.query().
342+ cohort_size : base_params.cohort_size, optional
343+ Exact number of samples to include. If specified, samples will be randomly
344+ selected to match this size.
345+ min_cohort_size : base_params.min_cohort_size, optional
346+ Minimum number of samples required. The function returns an empty DataFrame
347+ if fewer samples are available.
348+ max_cohort_size : base_params.max_cohort_size, optional
349+ Maximum number of samples to include. If more samples are available,
350+ they will be randomly sampled to this size.
342351
343352 Returns
344353 -------
345354 pd.DataFrame
346- DataFrame containing phenotype data merged with sample metadata.
355+ DataFrame containing phenotype data merged with sample metadata. Includes
356+ columns for sample identifiers, phenotypic measurements, experimental
357+ conditions (insecticide, dose), and sample metadata (location, collection
358+ date, etc.).
347359 """
348360 # 1. Normalize sample_sets
349361 sample_sets_norm = self ._prep_sample_sets_param (sample_sets = sample_sets )
@@ -405,7 +417,36 @@ def phenotypes_with_snps(
405417 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
406418 ) -> xr .Dataset :
407419 """
408- Load phenotypic data and merge with SNP calls.
420+ Load phenotypic data and merge with SNP calls for genetic association analysis.
421+
422+ This function combines phenotypic traits with SNP genotype data, enabling
423+ genome-wide association studies (GWAS) and other genetic analyses. It first
424+ retrieves phenotype data based on the provided filters, then fetches SNP calls
425+ for the same samples within the specified genomic region.
426+
427+ Parameters
428+ ----------
429+ region : base_params.region
430+ Genomic region specification (e.g., chromosome, start/end positions).
431+ sample_sets : base_params.sample_sets, optional
432+ Specific sample sets to include in the analysis.
433+ sample_query : base_params.sample_query, optional
434+ Query string to filter samples (e.g., "insecticide == 'Deltamethrin'").
435+ sample_query_options : base_params.sample_query_options, optional
436+ Additional query options for sample filtering.
437+ cohort_size : base_params.cohort_size, optional
438+ Exact number of samples to include.
439+ min_cohort_size : base_params.min_cohort_size, optional
440+ Minimum number of samples required.
441+ max_cohort_size : base_params.max_cohort_size, optional
442+ Maximum number of samples to include.
443+
444+ Returns
445+ -------
446+ xr.Dataset
447+ An xarray Dataset containing phenotype data indexed by sample_id and
448+ SNP genotype calls for the specified genomic region, merged for
449+ direct phenotype-genotype analysis.
409450 """
410451 df_phenotypes = self .phenotype_data (
411452 sample_sets = sample_sets ,
@@ -444,7 +485,36 @@ def phenotypes_with_haplotypes(
444485 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
445486 ) -> xr .Dataset :
446487 """
447- Load phenotypic data and merge with haplotype data.
488+ Load phenotypic data and merge with haplotype data for extended genetic analysis.
489+
490+ This function combines phenotypic traits with haplotype data, enabling analysis
491+ of linked genetic variants and their association with phenotypes. Haplotypes
492+ represent combinations of alleles at multiple nearby loci that are inherited
493+ together.
494+
495+ Parameters
496+ ----------
497+ region : base_params.region
498+ Genomic region specification for haplotype analysis.
499+ sample_sets : base_params.sample_sets, optional
500+ Specific sample sets to include in the analysis.
501+ sample_query : base_params.sample_query, optional
502+ Query string to filter samples.
503+ sample_query_options : base_params.sample_query_options, optional
504+ Additional query options for sample filtering.
505+ cohort_size : base_params.cohort_size, optional
506+ Exact number of samples to include.
507+ min_cohort_size : base_params.min_cohort_size, optional
508+ Minimum number of samples required.
509+ max_cohort_size : base_params.max_cohort_size, optional
510+ Maximum number of samples to include.
511+
512+ Returns
513+ -------
514+ xr.Dataset
515+ An xarray Dataset containing phenotype data indexed by sample_id and
516+ haplotype data for the specified genomic region, merged for
517+ haplotype-phenotype association analysis.
448518 """
449519 df_phenotypes = self .phenotype_data (
450520 sample_sets = sample_sets ,
@@ -476,10 +546,14 @@ def phenotype_sample_sets(self) -> List[str]:
476546 """
477547 Get list of sample sets that have phenotypic data available.
478548
549+ This function scans the available data repository to identify which sample sets
550+ contain phenotypic information, helping users determine which datasets can be
551+ used for phenotype-based analyses.
552+
479553 Returns
480554 -------
481555 List[str]
482- List of sample set identifiers with available phenotype data.
556+ List of sample set identifiers that have associated phenotype data available.
483557 """
484558 all_sample_sets = self .sample_sets ()["sample_set" ].tolist () # type: ignore[operator]
485559 phenotype_sample_sets = []
@@ -510,8 +584,41 @@ def phenotype_binary(
510584 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
511585 ) -> pd .Series :
512586 """
513- Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
514- Returns a pandas Series indexed by sample_id.
587+ Load phenotypic data as binary outcomes for statistical analysis.
588+
589+ This function converts phenotypic measurements into binary classifications
590+ suitable for statistical analysis, particularly useful for resistance/susceptibility
591+ studies. The binary encoding follows: 1=alive/resistant, 0=dead/susceptible,
592+ NaN=unknown/missing.
593+
594+ Parameters
595+ ----------
596+ sample_sets : base_params.sample_sets, optional
597+ Specific sample sets to include in the analysis.
598+ insecticide : phenotype_params.insecticide, optional
599+ Insecticide type(s) to filter by. Can be a single value or list.
600+ dose : phenotype_params.dose, optional
601+ Dose level(s) to filter by. Can be a single value or list.
602+ phenotype : phenotype_params.phenotype, optional
603+ Specific phenotype(s) to filter by. Can be a single value or list.
604+ sample_query : base_params.sample_query, optional
605+ Additional query string for sample filtering.
606+ sample_query_options : base_params.sample_query_options, optional
607+ Additional query options for sample filtering.
608+ cohort_size : base_params.cohort_size, optional
609+ Exact number of samples to include.
610+ min_cohort_size : base_params.min_cohort_size, optional
611+ Minimum number of samples required.
612+ max_cohort_size : base_params.max_cohort_size, optional
613+ Maximum number of samples to include.
614+
615+ Returns
616+ -------
617+ pd.Series
618+ A pandas Series indexed by sample_id with binary values:
619+ - 1: Alive/resistant phenotype
620+ - 0: Dead/susceptible phenotype
621+ - NaN: Unknown or missing phenotype
515622 """
516623 # Build the sample_query string from individual parameters
517624 query_parts = []
0 commit comments