Skip to content

Commit ffe7cde

Browse files
docs: improve phenotype function docstrings
1 parent 91ac2e2 commit ffe7cde

1 file changed

Lines changed: 126 additions & 19 deletions

File tree

malariagen_data/anoph/phenotypes.py

Lines changed: 126 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -320,30 +320,42 @@ def phenotype_data(
320320
"""
321321
Load phenotypic data from insecticide resistance bioassays.
322322
323+
This function retrieves phenotypic data from bioassay experiments and merges it
324+
with sample metadata to provide a comprehensive dataset for analysis. The data
325+
includes information about insecticide resistance testing, mortality rates,
326+
and associated sample characteristics.
327+
323328
Parameters
324329
----------
325-
sample_sets : Optional[base_params.sample_sets]
326-
Sample sets to load data for.
327-
sample_query : Optional[base_params.sample_query]
328-
Query string to filter samples. Can include phenotype-specific columns like:
330+
sample_sets : base_params.sample_sets, optional
331+
Sample sets to load data for. If None, loads data from all available sample sets.
332+
sample_query : base_params.sample_query, optional
333+
Query string to filter samples using pandas query syntax. Can include
334+
phenotype-specific columns such as:
329335
- insecticide: e.g., "insecticide == 'Deltamethrin'"
330336
- dose: e.g., "dose in [0.5, 2.0]"
331337
- phenotype: e.g., "phenotype == 'alive'"
332338
- location: e.g., "location == 'Cotonou'"
333-
- Any other metadata columns
334-
sample_query_options : Optional[base_params.sample_query_options]
335-
Options for the sample query.
336-
cohort_size : Optional[base_params.cohort_size]
337-
Exact cohort size for sampling.
338-
min_cohort_size : Optional[base_params.min_cohort_size]
339-
Minimum cohort size to include.
340-
max_cohort_size : Optional[base_params.max_cohort_size]
341-
Maximum cohort size (will be randomly sampled if exceeded).
339+
- Any other metadata columns available in the dataset
340+
sample_query_options : base_params.sample_query_options, optional
341+
Additional options for the sample query, passed to pandas.DataFrame.query().
342+
cohort_size : base_params.cohort_size, optional
343+
Exact number of samples to include. If specified, samples will be randomly
344+
selected to match this size.
345+
min_cohort_size : base_params.min_cohort_size, optional
346+
Minimum number of samples required. The function returns an empty DataFrame
347+
if fewer samples are available.
348+
max_cohort_size : base_params.max_cohort_size, optional
349+
Maximum number of samples to include. If more samples are available,
350+
they will be randomly sampled to this size.
342351
343352
Returns
344353
-------
345354
pd.DataFrame
346-
DataFrame containing phenotype data merged with sample metadata.
355+
DataFrame containing phenotype data merged with sample metadata. Includes
356+
columns for sample identifiers, phenotypic measurements, experimental
357+
conditions (insecticide, dose), and sample metadata (location, collection
358+
date, etc.).
347359
"""
348360
# 1. Normalize sample_sets
349361
sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
@@ -405,7 +417,36 @@ def phenotypes_with_snps(
405417
max_cohort_size: Optional[base_params.max_cohort_size] = None,
406418
) -> xr.Dataset:
407419
"""
408-
Load phenotypic data and merge with SNP calls.
420+
Load phenotypic data and merge with SNP calls for genetic association analysis.
421+
422+
This function combines phenotypic traits with SNP genotype data, enabling
423+
genome-wide association studies (GWAS) and other genetic analyses. It first
424+
retrieves phenotype data based on the provided filters, then fetches SNP calls
425+
for the same samples within the specified genomic region.
426+
427+
Parameters
428+
----------
429+
region : base_params.region
430+
Genomic region specification (e.g., chromosome, start/end positions).
431+
sample_sets : base_params.sample_sets, optional
432+
Specific sample sets to include in the analysis.
433+
sample_query : base_params.sample_query, optional
434+
Query string to filter samples (e.g., "insecticide == 'Deltamethrin'").
435+
sample_query_options : base_params.sample_query_options, optional
436+
Additional query options for sample filtering.
437+
cohort_size : base_params.cohort_size, optional
438+
Exact number of samples to include.
439+
min_cohort_size : base_params.min_cohort_size, optional
440+
Minimum number of samples required.
441+
max_cohort_size : base_params.max_cohort_size, optional
442+
Maximum number of samples to include.
443+
444+
Returns
445+
-------
446+
xr.Dataset
447+
An xarray Dataset containing phenotype data indexed by sample_id and
448+
SNP genotype calls for the specified genomic region, merged for
449+
direct phenotype-genotype analysis.
409450
"""
410451
df_phenotypes = self.phenotype_data(
411452
sample_sets=sample_sets,
@@ -444,7 +485,36 @@ def phenotypes_with_haplotypes(
444485
max_cohort_size: Optional[base_params.max_cohort_size] = None,
445486
) -> xr.Dataset:
446487
"""
447-
Load phenotypic data and merge with haplotype data.
488+
Load phenotypic data and merge with haplotype data for extended genetic analysis.
489+
490+
This function combines phenotypic traits with haplotype data, enabling analysis
491+
of linked genetic variants and their association with phenotypes. Haplotypes
492+
represent combinations of alleles at multiple nearby loci that are inherited
493+
together.
494+
495+
Parameters
496+
----------
497+
region : base_params.region
498+
Genomic region specification for haplotype analysis.
499+
sample_sets : base_params.sample_sets, optional
500+
Specific sample sets to include in the analysis.
501+
sample_query : base_params.sample_query, optional
502+
Query string to filter samples.
503+
sample_query_options : base_params.sample_query_options, optional
504+
Additional query options for sample filtering.
505+
cohort_size : base_params.cohort_size, optional
506+
Exact number of samples to include.
507+
min_cohort_size : base_params.min_cohort_size, optional
508+
Minimum number of samples required.
509+
max_cohort_size : base_params.max_cohort_size, optional
510+
Maximum number of samples to include.
511+
512+
Returns
513+
-------
514+
xr.Dataset
515+
An xarray Dataset containing phenotype data indexed by sample_id and
516+
haplotype data for the specified genomic region, merged for
517+
haplotype-phenotype association analysis.
448518
"""
449519
df_phenotypes = self.phenotype_data(
450520
sample_sets=sample_sets,
@@ -476,10 +546,14 @@ def phenotype_sample_sets(self) -> List[str]:
476546
"""
477547
Get list of sample sets that have phenotypic data available.
478548
549+
This function scans the available data repository to identify which sample sets
550+
contain phenotypic information, helping users determine which datasets can be
551+
used for phenotype-based analyses.
552+
479553
Returns
480554
-------
481555
List[str]
482-
List of sample set identifiers with available phenotype data.
556+
List of sample set identifiers that have associated phenotype data available.
483557
"""
484558
all_sample_sets = self.sample_sets()["sample_set"].tolist() # type: ignore[operator]
485559
phenotype_sample_sets = []
@@ -510,8 +584,41 @@ def phenotype_binary(
510584
max_cohort_size: Optional[base_params.max_cohort_size] = None,
511585
) -> pd.Series:
512586
"""
513-
Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
514-
Returns a pandas Series indexed by sample_id.
587+
Load phenotypic data as binary outcomes for statistical analysis.
588+
589+
This function converts phenotypic measurements into binary classifications
590+
suitable for statistical analysis, particularly useful for resistance/susceptibility
591+
studies. The binary encoding follows: 1=alive/resistant, 0=dead/susceptible,
592+
NaN=unknown/missing.
593+
594+
Parameters
595+
----------
596+
sample_sets : base_params.sample_sets, optional
597+
Specific sample sets to include in the analysis.
598+
insecticide : phenotype_params.insecticide, optional
599+
Insecticide type(s) to filter by. Can be a single value or list.
600+
dose : phenotype_params.dose, optional
601+
Dose level(s) to filter by. Can be a single value or list.
602+
phenotype : phenotype_params.phenotype, optional
603+
Specific phenotype(s) to filter by. Can be a single value or list.
604+
sample_query : base_params.sample_query, optional
605+
Additional query string for sample filtering.
606+
sample_query_options : base_params.sample_query_options, optional
607+
Additional query options for sample filtering.
608+
cohort_size : base_params.cohort_size, optional
609+
Exact number of samples to include.
610+
min_cohort_size : base_params.min_cohort_size, optional
611+
Minimum number of samples required.
612+
max_cohort_size : base_params.max_cohort_size, optional
613+
Maximum number of samples to include.
614+
615+
Returns
616+
-------
617+
pd.Series
618+
A pandas Series indexed by sample_id with binary values:
619+
- 1: Alive/resistant phenotype
620+
- 0: Dead/susceptible phenotype
621+
- NaN: Unknown or missing phenotype
515622
"""
516623
# Build the sample_query string from individual parameters
517624
query_parts = []

0 commit comments

Comments
 (0)