Merge pull request #810 from malariagen/docs/phenotype-functions

leehart · web-flow · commit f591dba1921d · 2025-09-04T11:13:13.000+01:00
docs: remove usage examples and register phenotype methods in Ag3.rst
diff --git a/docs/source/Ag3.rst b/docs/source/Ag3.rst
@@ -224,3 +224,14 @@ Inversion karyotypes
     :toctree: generated/
 
     karyotype
+
+Phenotype data access
+---------------------
+.. autosummary::
+    :toctree: generated/
+
+    phenotype_data
+    phenotypes_with_snps
+    phenotypes_with_haplotypes
+    phenotype_sample_sets
+    phenotype_binary
diff --git a/malariagen_data/anoph/phenotypes.py b/malariagen_data/anoph/phenotypes.py
@@ -3,6 +3,9 @@
 from typing import Callable, Optional, List, Any
 import warnings
 import fsspec
+from numpydoc_decorator import doc  # type: ignore
+
+from ..util import check_types
 from malariagen_data.anoph import base_params, phenotype_params
 
 
@@ -308,6 +311,13 @@ def _create_phenotype_dataset(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="Load phenotypic data from insecticide resistance bioassays.",
+        returns=dict(
+            df="DataFrame containing phenotype data merged with sample metadata. Includes sample identifiers, phenotypic measurements, and experimental conditions."
+        ),
+    )
     def phenotype_data(
         self,
         sample_sets: Optional[base_params.sample_sets] = None,
@@ -318,55 +328,9 @@ def phenotype_data(
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> pd.DataFrame:
         """
-        Load phenotypic data from insecticide resistance bioassays.
-
-        Parameters
-        ----------
-        sample_sets : Optional[base_params.sample_sets]
-            Sample sets to load data for.
-        sample_query : Optional[base_params.sample_query]
-            Query string to filter samples. Can include phenotype-specific columns like:
-            - insecticide: e.g., "insecticide == 'Deltamethrin'"
-            - dose: e.g., "dose in [0.5, 2.0]"
-            - phenotype: e.g., "phenotype == 'alive'"
-            - location: e.g., "location == 'Cotonou'"
-            - Any other metadata columns
-        sample_query_options : Optional[base_params.sample_query_options]
-            Options for the sample query.
-        cohort_size : Optional[base_params.cohort_size]
-            Exact cohort size for sampling.
-        min_cohort_size : Optional[base_params.min_cohort_size]
-            Minimum cohort size to include.
-        max_cohort_size : Optional[base_params.max_cohort_size]
-            Maximum cohort size (will be randomly sampled if exceeded).
-
-        Returns
-        -------
-        pd.DataFrame
-            DataFrame containing phenotype data merged with sample metadata.
-
-        Examples
-        --------
-        # Load all phenotype data
-        df = ag3.phenotype_data(sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'])
-
-        # Filter by insecticide
-        df = ag3.phenotype_data(
-            sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
-            sample_query="insecticide == 'Deltamethrin'"
-        )
-
-        # Filter by multiple criteria
-        df = ag3.phenotype_data(
-            sample_sets=['1237-VO-BJ-DJOGBENOU-VMF00050'],
-            sample_query="insecticide == 'Deltamethrin' and dose >= 1.0 and phenotype == 'alive'"
-        )
-
-        # Filter by location and insecticide
-        df = ag3.phenotype_data(
-            sample_query="location == 'Cotonou' and insecticide in ['Deltamethrin', 'Bendiocarb']"
-        )
+        Retrieve and merge phenotype data with sample metadata for bioassay analysis.
         """
+
         # 1. Normalize sample_sets
         sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
 
@@ -416,6 +380,13 @@ def phenotype_data(
 
         return df_final
 
+    @check_types
+    @doc(
+        summary="Combine phenotypic traits with SNP genotype data for genome-wide association studies (GWAS).",
+        returns=dict(
+            ds="xarray Dataset containing phenotype data and SNP genotype calls for the specified region."
+        ),
+    )
     def phenotypes_with_snps(
         self,
         region: base_params.region,
@@ -426,9 +397,8 @@ def phenotypes_with_snps(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> xr.Dataset:
-        """
-        Load phenotypic data and merge with SNP calls.
-        """
+        """Merge phenotypes with SNP calls in a given region for association testing."""
+
         df_phenotypes = self.phenotype_data(
             sample_sets=sample_sets,
             sample_query=sample_query,
@@ -455,6 +425,13 @@ def phenotypes_with_snps(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="Combine phenotypic traits with haplotype data for extended association analysis.",
+        returns=dict(
+            ds="xarray Dataset with phenotype and haplotype data for the specified region."
+        ),
+    )
     def phenotypes_with_haplotypes(
         self,
         region: base_params.region,
@@ -465,9 +442,8 @@ def phenotypes_with_haplotypes(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> xr.Dataset:
-        """
-        Load phenotypic data and merge with haplotype data.
-        """
+        """Merge phenotypes with haplotype data in a given region for association testing."""
+
         df_phenotypes = self.phenotype_data(
             sample_sets=sample_sets,
             sample_query=sample_query,
@@ -494,15 +470,14 @@ def phenotypes_with_haplotypes(
 
         return ds
 
+    @check_types
+    @doc(
+        summary="List sample sets that contain phenotypic data.",
+        returns=dict(sample_sets="List of sample set identifiers with phenotype data."),
+    )
     def phenotype_sample_sets(self) -> List[str]:
-        """
-        Get list of sample sets that have phenotypic data available.
+        """Identify sample sets containing phenotype data."""
 
-        Returns
-        -------
-        List[str]
-            List of sample set identifiers with available phenotype data.
-        """
         all_sample_sets = self.sample_sets()["sample_set"].tolist()  # type: ignore[operator]
         phenotype_sample_sets = []
         base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
@@ -517,6 +492,12 @@ def phenotype_sample_sets(self) -> List[str]:
 
         return phenotype_sample_sets
 
+    @doc(
+        summary="Convert phenotype data into binary format for statistical analysis.",
+        returns=dict(
+            binary="Pandas Series indexed by sample_id with binary classification: 1 for alive/resistant, 0 for dead/susceptible, NaN for unknown."
+        ),
+    )
     def phenotype_binary(
         self,
         sample_sets: Optional[base_params.sample_sets] = None,
@@ -531,10 +512,8 @@ def phenotype_binary(
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
     ) -> pd.Series:
-        """
-        Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
-        Returns a pandas Series indexed by sample_id.
-        """
+        """Generate binary phenotypic labels from raw phenotype data."""
+
         # Build the sample_query string from individual parameters
         query_parts = []
         if insecticide is not None: