Skip to content

Commit 8d1e289

Browse files
docs: refactor phenotype method docstrings to use @doc and @check_types decorators
1 parent ffe7cde commit 8d1e289

1 file changed

Lines changed: 44 additions & 150 deletions

File tree

malariagen_data/anoph/phenotypes.py

Lines changed: 44 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from typing import Callable, Optional, List, Any
44
import warnings
55
import fsspec
6+
from numpydoc_decorator import doc # type: ignore
7+
8+
from ..util import check_types
69
from malariagen_data.anoph import base_params, phenotype_params
710

811

@@ -308,6 +311,13 @@ def _create_phenotype_dataset(
308311

309312
return ds
310313

314+
@check_types
315+
@doc(
316+
summary="Load phenotypic data from insecticide resistance bioassays.",
317+
returns=dict(
318+
df="DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
319+
),
320+
)
311321
def phenotype_data(
312322
self,
313323
sample_sets: Optional[base_params.sample_sets] = None,
@@ -318,45 +328,9 @@ def phenotype_data(
318328
max_cohort_size: Optional[base_params.max_cohort_size] = None,
319329
) -> pd.DataFrame:
320330
"""
321-
Load phenotypic data from insecticide resistance bioassays.
322-
323-
This function retrieves phenotypic data from bioassay experiments and merges it
324-
with sample metadata to provide a comprehensive dataset for analysis. The data
325-
includes information about insecticide resistance testing, mortality rates,
326-
and associated sample characteristics.
327-
328-
Parameters
329-
----------
330-
sample_sets : base_params.sample_sets, optional
331-
Sample sets to load data for. If None, loads data from all available sample sets.
332-
sample_query : base_params.sample_query, optional
333-
Query string to filter samples using pandas query syntax. Can include
334-
phenotype-specific columns such as:
335-
- insecticide: e.g., "insecticide == 'Deltamethrin'"
336-
- dose: e.g., "dose in [0.5, 2.0]"
337-
- phenotype: e.g., "phenotype == 'alive'"
338-
- location: e.g., "location == 'Cotonou'"
339-
- Any other metadata columns available in the dataset
340-
sample_query_options : base_params.sample_query_options, optional
341-
Additional options for the sample query, passed to pandas.DataFrame.query().
342-
cohort_size : base_params.cohort_size, optional
343-
Exact number of samples to include. If specified, samples will be randomly
344-
selected to match this size.
345-
min_cohort_size : base_params.min_cohort_size, optional
346-
Minimum number of samples required. The function will return an empty DataFrame
347-
if fewer samples are available.
348-
max_cohort_size : base_params.max_cohort_size, optional
349-
Maximum number of samples to include. If more samples are available,
350-
they will be randomly sampled to this size.
351-
352-
Returns
353-
-------
354-
pd.DataFrame
355-
DataFrame containing phenotype data merged with sample metadata. Includes
356-
columns for sample identifiers, phenotypic measurements, experimental
357-
conditions (insecticide, dose), and sample metadata (location, collection
358-
date, etc.).
331+
Retrieve and merge phenotype data with sample metadata for bioassay analysis.
359332
"""
333+
360334
# 1. Normalize sample_sets
361335
sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
362336

@@ -406,6 +380,13 @@ def phenotype_data(
406380

407381
return df_final
408382

383+
@check_types
384+
@doc(
385+
summary="Combine phenotypic traits with SNP genotype data for GWAS analysis.",
386+
returns=dict(
387+
ds="xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
388+
),
389+
)
409390
def phenotypes_with_snps(
410391
self,
411392
region: base_params.region,
@@ -416,38 +397,8 @@ def phenotypes_with_snps(
416397
min_cohort_size: Optional[base_params.min_cohort_size] = None,
417398
max_cohort_size: Optional[base_params.max_cohort_size] = None,
418399
) -> xr.Dataset:
419-
"""
420-
Load phenotypic data and merge with SNP calls for genetic association analysis.
421-
422-
This function combines phenotypic traits with SNP genotype data, enabling
423-
genome-wide association studies (GWAS) and other genetic analyses. It first
424-
retrieves phenotype data based on the provided filters, then fetches SNP calls
425-
for the same samples within the specified genomic region.
426-
427-
Parameters
428-
----------
429-
region : base_params.region
430-
Genomic region specification (e.g., chromosome, start/end positions).
431-
sample_sets : base_params.sample_sets, optional
432-
Specific sample sets to include in the analysis.
433-
sample_query : base_params.sample_query, optional
434-
Query string to filter samples (e.g., "population == 'CEU'").
435-
sample_query_options : base_params.sample_query_options, optional
436-
Additional query options for sample filtering.
437-
cohort_size : base_params.cohort_size, optional
438-
Exact number of samples to include.
439-
min_cohort_size : base_params.min_cohort_size, optional
440-
Minimum number of samples required.
441-
max_cohort_size : base_params.max_cohort_size, optional
442-
Maximum number of samples to include.
443-
444-
Returns
445-
-------
446-
xr.Dataset
447-
An xarray Dataset containing phenotype data indexed by sample_id and
448-
SNP genotype calls for the specified genomic region, merged for
449-
direct phenotype-genotype analysis.
450-
"""
400+
"""Merge phenotypes with SNP calls in a given region for association testing."""
401+
451402
df_phenotypes = self.phenotype_data(
452403
sample_sets=sample_sets,
453404
sample_query=sample_query,
@@ -474,6 +425,13 @@ def phenotypes_with_snps(
474425

475426
return ds
476427

428+
@check_types
429+
@doc(
430+
summary="Combine phenotypic traits with haplotype data for extended association analysis.",
431+
returns=dict(
432+
ds="xarray Dataset with phenotype and haplotype data for the specified region."
433+
),
434+
)
477435
def phenotypes_with_haplotypes(
478436
self,
479437
region: base_params.region,
@@ -484,38 +442,8 @@ def phenotypes_with_haplotypes(
484442
min_cohort_size: Optional[base_params.min_cohort_size] = None,
485443
max_cohort_size: Optional[base_params.max_cohort_size] = None,
486444
) -> xr.Dataset:
487-
"""
488-
Load phenotypic data and merge with haplotype data for extended genetic analysis.
489-
490-
This function combines phenotypic traits with haplotype data, enabling analysis
491-
of linked genetic variants and their association with phenotypes. Haplotypes
492-
represent combinations of alleles at multiple nearby loci that are inherited
493-
together.
494-
495-
Parameters
496-
----------
497-
region : base_params.region
498-
Genomic region specification for haplotype analysis.
499-
sample_sets : base_params.sample_sets, optional
500-
Specific sample sets to include in the analysis.
501-
sample_query : base_params.sample_query, optional
502-
Query string to filter samples.
503-
sample_query_options : base_params.sample_query_options, optional
504-
Additional query options for sample filtering.
505-
cohort_size : base_params.cohort_size, optional
506-
Exact number of samples to include.
507-
min_cohort_size : base_params.min_cohort_size, optional
508-
Minimum number of samples required.
509-
max_cohort_size : base_params.max_cohort_size, optional
510-
Maximum number of samples to include.
511-
512-
Returns
513-
-------
514-
xr.Dataset
515-
An xarray Dataset containing phenotype data indexed by sample_id and
516-
haplotype data for the specified genomic region, merged for
517-
haplotype-phenotype association analysis.
518-
"""
445+
"""Merge phenotypes with haplotype data in a given region for association testing."""
446+
519447
df_phenotypes = self.phenotype_data(
520448
sample_sets=sample_sets,
521449
sample_query=sample_query,
@@ -542,19 +470,14 @@ def phenotypes_with_haplotypes(
542470

543471
return ds
544472

473+
@check_types
474+
@doc(
475+
summary="List sample sets that contain phenotypic data.",
476+
returns=dict(sample_sets="List of sample set identifiers with phenotype data."),
477+
)
545478
def phenotype_sample_sets(self) -> List[str]:
546-
"""
547-
Get list of sample sets that have phenotypic data available.
479+
"""Identify sample sets containing phenotype data."""
548480

549-
This function scans the available data repository to identify which sample sets
550-
contain phenotypic information, helping users determine which datasets can be
551-
used for phenotype-based analyses.
552-
553-
Returns
554-
-------
555-
List[str]
556-
List of sample set identifiers that have associated phenotype data available.
557-
"""
558481
all_sample_sets = self.sample_sets()["sample_set"].tolist() # type: ignore[operator]
559482
phenotype_sample_sets = []
560483
base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
@@ -569,6 +492,12 @@ def phenotype_sample_sets(self) -> List[str]:
569492

570493
return phenotype_sample_sets
571494

495+
@doc(
496+
summary="Convert phenotype data into binary format for statistical analysis.",
497+
returns=dict(
498+
binary="Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
499+
),
500+
)
572501
def phenotype_binary(
573502
self,
574503
sample_sets: Optional[base_params.sample_sets] = None,
@@ -583,43 +512,8 @@ def phenotype_binary(
583512
min_cohort_size: Optional[base_params.min_cohort_size] = None,
584513
max_cohort_size: Optional[base_params.max_cohort_size] = None,
585514
) -> pd.Series:
586-
"""
587-
Load phenotypic data as binary outcomes for statistical analysis.
588-
589-
This function converts phenotypic measurements into binary classifications
590-
suitable for statistical analysis, particularly useful for resistance/susceptibility
591-
studies. The binary encoding follows: 1=alive/resistant, 0=dead/susceptible,
592-
NaN=unknown/missing.
593-
594-
Parameters
595-
----------
596-
sample_sets : base_params.sample_sets, optional
597-
Specific sample sets to include in the analysis.
598-
insecticide : phenotype_params.insecticide, optional
599-
Insecticide type(s) to filter by. Can be a single value or list.
600-
dose : phenotype_params.dose, optional
601-
Dose level(s) to filter by. Can be a single value or list.
602-
phenotype : phenotype_params.phenotype, optional
603-
Specific phenotype(s) to filter by. Can be a single value or list.
604-
sample_query : base_params.sample_query, optional
605-
Additional query string for sample filtering.
606-
sample_query_options : base_params.sample_query_options, optional
607-
Additional query options for sample filtering.
608-
cohort_size : base_params.cohort_size, optional
609-
Exact number of samples to include.
610-
min_cohort_size : base_params.min_cohort_size, optional
611-
Minimum number of samples required.
612-
max_cohort_size : base_params.max_cohort_size, optional
613-
Maximum number of samples to include.
614-
615-
Returns
616-
-------
617-
pd.Series
618-
A pandas Series indexed by sample_id with binary values:
619-
- 1: Alive/resistant phenotype
620-
- 0: Dead/susceptible phenotype
621-
- NaN: Unknown or missing phenotype
622-
"""
515+
"""Generate binary phenotypic labels from raw phenotype data."""
516+
623517
# Build the sample_query string from individual parameters
624518
query_parts = []
625519
if insecticide is not None:

0 commit comments

Comments
 (0)