docs: refactor phenotype method docstrings to use @doc and @check_types decorators

mohamed-laarej · mohamed-laarej · commit 8d1e289070bb · 2025-07-10T17:54:13.000+01:00
diff --git a/malariagen_data/anoph/phenotypes.py b/malariagen_data/anoph/phenotypes.py
@@ -3,6 +3,9 @@
 from typing import Callable, Optional, List, Any
 import warnings
 import fsspec
+from numpydoc_decorator import doc  # type: ignore
+
+from ..util import check_types
 from malariagen_data.anoph import base_params, phenotype_params
 
 
@@ -308,6 +311,13 @@ def _create_phenotype_dataset(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="Load phenotypic data from insecticide resistance bioassays.",
+        returns=dict(
+            df="DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
+        ),
+    )
     def phenotype_data(
         self,
         sample_sets: Optional[base_params.sample_sets] = None,
@@ -318,45 +328,9 @@ def phenotype_data(
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> pd.DataFrame:
         """
-        Load phenotypic data from insecticide resistance bioassays.
-
-        This function retrieves phenotypic data from bioassay experiments and merges it
-        with sample metadata to provide a comprehensive dataset for analysis. The data
-        includes information about insecticide resistance testing, mortality rates,
-        and associated sample characteristics.
-
-        Parameters
-        ----------
-        sample_sets : base_params.sample_sets, optional
-            Sample sets to load data for. If None, loads data from all available sample sets.
-        sample_query : base_params.sample_query, optional
-            Query string to filter samples using pandas query syntax. Can include
-            phenotype-specific columns such as:
-            - insecticide: e.g., "insecticide == 'Deltamethrin'"
-            - dose: e.g., "dose in [0.5, 2.0]"
-            - phenotype: e.g., "phenotype == 'alive'"
-            - location: e.g., "location == 'Cotonou'"
-            - Any other metadata columns available in the dataset
-        sample_query_options : base_params.sample_query_options, optional
-            Additional options for the sample query, passed to pandas.DataFrame.query().
-        cohort_size : base_params.cohort_size, optional
-            Exact number of samples to include. If specified, samples will be randomly
-            selected to match this size.
-        min_cohort_size : base_params.min_cohort_size, optional
-            Minimum number of samples required. Function will return empty DataFrame
-            if fewer samples are available.
-        max_cohort_size : base_params.max_cohort_size, optional
-            Maximum number of samples to include. If more samples are available,
-            they will be randomly sampled to this size.
-
-        Returns
-        -------
-        pd.DataFrame
-            DataFrame containing phenotype data merged with sample metadata. Includes
-            columns for sample identifiers, phenotypic measurements, experimental
-            conditions (insecticide, dose), and sample metadata (location, collection
-            date, etc.).
+        Retrieve and merge phenotype data with sample metadata for bioassay analysis.
         """
+
         # 1. Normalize sample_sets
         sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
 
@@ -406,6 +380,13 @@ def phenotype_data(
 
         return df_final
 
+    @check_types
+    @doc(
+        summary="Combine phenotypic traits with SNP genotype data for GWAS analysis.",
+        returns=dict(
+            ds="xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
+        ),
+    )
     def phenotypes_with_snps(
         self,
         region: base_params.region,
@@ -416,38 +397,8 @@ def phenotypes_with_snps(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> xr.Dataset:
-        """
-        Load phenotypic data and merge with SNP calls for genetic association analysis.
-
-        This function combines phenotypic traits with SNP genotype data, enabling
-        genome-wide association studies (GWAS) and other genetic analyses. It first
-        retrieves phenotype data based on the provided filters, then fetches SNP calls
-        for the same samples within the specified genomic region.
-
-        Parameters
-        ----------
-        region : base_params.region
-            Genomic region specification (e.g., chromosome, start/end positions).
-        sample_sets : base_params.sample_sets, optional
-            Specific sample sets to include in the analysis.
-        sample_query : base_params.sample_query, optional
-            Query string to filter samples (e.g., "population == 'CEU'").
-        sample_query_options : base_params.sample_query_options, optional
-            Additional query options for sample filtering.
-        cohort_size : base_params.cohort_size, optional
-            Exact number of samples to include.
-        min_cohort_size : base_params.min_cohort_size, optional
-            Minimum number of samples required.
-        max_cohort_size : base_params.max_cohort_size, optional
-            Maximum number of samples to include.
-
-        Returns
-        -------
-        xr.Dataset
-            A xarray Dataset containing phenotype data indexed by sample_id and
-            SNP genotype calls for the specified genomic region, merged for
-            direct phenotype-genotype analysis.
-        """
+        """Merge phenotypes with SNP calls in a given region for association testing."""
+
         df_phenotypes = self.phenotype_data(
             sample_sets=sample_sets,
             sample_query=sample_query,
@@ -474,6 +425,13 @@ def phenotypes_with_snps(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="Combine phenotypic traits with haplotype data for extended association analysis.",
+        returns=dict(
+            ds="xarray Dataset with phenotype and haplotype data for the specified region."
+        ),
+    )
     def phenotypes_with_haplotypes(
         self,
         region: base_params.region,
@@ -484,38 +442,8 @@ def phenotypes_with_haplotypes(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> xr.Dataset:
-        """
-        Load phenotypic data and merge with haplotype data for extended genetic analysis.
-
-        This function combines phenotypic traits with haplotype data, enabling analysis
-        of linked genetic variants and their association with phenotypes. Haplotypes
-        represent combinations of alleles at multiple nearby loci that are inherited
-        together.
-
-        Parameters
-        ----------
-        region : base_params.region
-            Genomic region specification for haplotype analysis.
-        sample_sets : base_params.sample_sets, optional
-            Specific sample sets to include in the analysis.
-        sample_query : base_params.sample_query, optional
-            Query string to filter samples.
-        sample_query_options : base_params.sample_query_options, optional
-            Additional query options for sample filtering.
-        cohort_size : base_params.cohort_size, optional
-            Exact number of samples to include.
-        min_cohort_size : base_params.min_cohort_size, optional
-            Minimum number of samples required.
-        max_cohort_size : base_params.max_cohort_size, optional
-            Maximum number of samples to include.
-
-        Returns
-        -------
-        xr.Dataset
-            A xarray Dataset containing phenotype data indexed by sample_id and
-            haplotype data for the specified genomic region, merged for
-            haplotype-phenotype association analysis.
-        """
+        """Merge phenotypes with haplotype data in a given region for association testing."""
+
         df_phenotypes = self.phenotype_data(
             sample_sets=sample_sets,
             sample_query=sample_query,
@@ -542,19 +470,14 @@ def phenotypes_with_haplotypes(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="List sample sets that contain phenotypic data.",
+        returns=dict(sample_sets="List of sample set identifiers with phenotype data."),
+    )
     def phenotype_sample_sets(self) -> List[str]:
-        """
-        Get list of sample sets that have phenotypic data available.
+        """Identify sample sets containing phenotype data."""
 
-        This function scans the available data repository to identify which sample sets
-        contain phenotypic information, helping users determine which datasets can be
-        used for phenotype-based analyses.
-
-        Returns
-        -------
-        List[str]
-            List of sample set identifiers that have associated phenotype data available.
-        """
         all_sample_sets = self.sample_sets()["sample_set"].tolist()  # type: ignore[operator]
         phenotype_sample_sets = []
         base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
@@ -569,6 +492,12 @@ def phenotype_sample_sets(self) -> List[str]:
 
         return phenotype_sample_sets
 
+    @doc(
+        summary="Convert phenotype data into binary format for statistical analysis.",
+        returns=dict(
+            binary="Pandas Series indexed by sample_id with binary classification: 1 for resistant, 0 for susceptible, NaN for unknown."
+        ),
+    )
     def phenotype_binary(
         self,
         sample_sets: Optional[base_params.sample_sets] = None,
@@ -583,43 +512,8 @@ def phenotype_binary(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> pd.Series:
-        """
-        Load phenotypic data as binary outcomes for statistical analysis.
-
-        This function converts phenotypic measurements into binary classifications
-        suitable for statistical analysis, particularly useful for resistance/susceptibility
-        studies. The binary encoding follows: 1=alive/resistant, 0=dead/susceptible,
-        NaN=unknown/missing.
-
-        Parameters
-        ----------
-        sample_sets : base_params.sample_sets, optional
-            Specific sample sets to include in the analysis.
-        insecticide : phenotype_params.insecticide, optional
-            Insecticide type(s) to filter by. Can be a single value or list.
-        dose : phenotype_params.dose, optional
-            Dose level(s) to filter by. Can be a single value or list.
-        phenotype : phenotype_params.phenotype, optional
-            Specific phenotype(s) to filter by. Can be a single value or list.
-        sample_query : base_params.sample_query, optional
-            Additional query string for sample filtering.
-        sample_query_options : base_params.sample_query_options, optional
-            Additional query options for sample filtering.
-        cohort_size : base_params.cohort_size, optional
-            Exact number of samples to include.
-        min_cohort_size : base_params.min_cohort_size, optional
-            Minimum number of samples required.
-        max_cohort_size : base_params.max_cohort_size, optional
-            Maximum number of samples to include.
-
-        Returns
-        -------
-        pd.Series
-            A pandas Series indexed by sample_id with binary values:
-            - 1: Alive/resistant phenotype
-            - 0: Dead/susceptible phenotype
-            - NaN: Unknown or missing phenotype
-        """
+        """Generate binary phenotypic labels from raw phenotype data."""
+
         # Build the sample_query string from individual parameters
         query_parts = []
         if insecticide is not None: