Fix: Restore phenotype_binary function and update tests/notebook

mohamed-laarej · mohamed-laarej · commit dffbec50dda0 · 2025-07-01T02:36:32.000+01:00
- Re-implemented the phenotype_binary method, which was inadvertently removed during a previous refactoring.
- Updated phenotype_binary to leverage the sample_query mechanism for filtering, aligning with mentor feedback.
- Added new integration tests for phenotype_binary functionality.
- Incorporated phenotype_binary examples into the demonstration notebook.
diff --git a/malariagen_data/anoph/phenotypes.py b/malariagen_data/anoph/phenotypes.py
@@ -3,7 +3,7 @@
 from typing import Callable, Optional, List, Any
 import warnings
 import fsspec
-from malariagen_data.anoph import base_params
+from malariagen_data.anoph import base_params, phenotype_params
 
 
 class AnophelesPhenotypeData:
@@ -516,3 +516,73 @@ def phenotype_sample_sets(self) -> List[str]:
                 continue
 
         return phenotype_sample_sets
+
+    def phenotype_binary(
+        self,
+        sample_sets: Optional[base_params.sample_sets] = None,
+        insecticide: Optional[phenotype_params.insecticide] = None,
+        dose: Optional[phenotype_params.dose] = None,
+        phenotype: Optional[phenotype_params.phenotype] = None,
+        sample_query: Optional[
+            base_params.sample_query
+        ] = None,  # Allow direct sample_query
+        sample_query_options: Optional[base_params.sample_query_options] = None,
+        cohort_size: Optional[base_params.cohort_size] = None,
+        min_cohort_size: Optional[base_params.min_cohort_size] = None,
+        max_cohort_size: Optional[base_params.max_cohort_size] = None,
+    ) -> pd.Series:
+        """
+        Load phenotypic data as binary outcomes (1=alive/resistant, 0=dead/susceptible, NaN=unknown).
+        Returns a pandas Series indexed by sample_id.
+        """
+        # Build the sample_query string from individual parameters
+        query_parts = []
+        if insecticide is not None:
+            if isinstance(insecticide, list):
+                query_parts.append(f"insecticide in {insecticide}")
+            else:
+                query_parts.append(f"insecticide == '{insecticide}'")
+        if dose is not None:
+            if isinstance(dose, list):
+                query_parts.append(f"dose in {dose}")
+            else:
+                query_parts.append(f"dose == {dose}")
+        if phenotype is not None:
+            if isinstance(phenotype, list):
+                query_parts.append(f"phenotype in {phenotype}")
+            else:
+                query_parts.append(f"phenotype == '{phenotype}'")
+
+        # Combine with an existing sample_query if provided
+        final_sample_query = sample_query
+        if query_parts:
+            generated_query = " and ".join(query_parts)
+            if final_sample_query:
+                final_sample_query = f"({final_sample_query}) and ({generated_query})"
+            else:
+                final_sample_query = generated_query
+
+        df = self.phenotype_data(
+            sample_sets=sample_sets,
+            sample_query=final_sample_query,
+            sample_query_options=sample_query_options,
+            cohort_size=cohort_size,
+            min_cohort_size=min_cohort_size,
+            max_cohort_size=max_cohort_size,
+        )
+
+        if df.empty:
+            return pd.Series(dtype=float, name="phenotype_binary")
+
+        binary_series = self._create_phenotype_binary_series(df)
+
+        binary_series.name = "phenotype_binary"
+        # Ensure the index is correctly set to sample_id
+        if "sample_id" in df.columns:
+            binary_series.index = pd.Index(df["sample_id"])
+        else:
+            warnings.warn(
+                "Cannot set index to sample_id as it is missing from the DataFrame returned by phenotype_data."
+            )
+
+        return binary_series
diff --git a/notebooks/phenotype_data_demo.ipynb b/notebooks/phenotype_data_demo.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "id": "f1280de7-129f-4875-bf12-eb68b8d88573",
    "metadata": {},
    "outputs": [
@@ -396,7 +396,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "id": "7dd3ebc4-64f7-4d0b-ad01-fbc1e08b0636",
    "metadata": {},
    "outputs": [
@@ -443,7 +443,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 3,
    "id": "c0b9d0ae-6a79-458a-ad9f-9281a1c6112e",
    "metadata": {},
    "outputs": [
@@ -453,7 +453,7 @@
      "text": [
       "\n",
       "--- Loading phenotype data for '1237-VO-BJ-DJOGBENOU-VMF00050' filtered by Deltamethrin ---\n",
-      "Shape of DataFrame: (88, 60)\n",
+      "Shape of DataFrame: (88, 60)         \n",
       "\n",
       "First 5 rows of the filtered DataFrame:\n",
       "\n",
@@ -555,7 +555,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
    "id": "eab7d81c-d80c-4279-9263-a0d7b425ff85",
    "metadata": {},
    "outputs": [
@@ -671,7 +671,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 5,
    "id": "de8cd777-a4b2-44dd-a0bf-64a6ef419da7",
    "metadata": {},
    "outputs": [
@@ -780,18 +780,198 @@
     "#         print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "9a1f4cc9-528f-40bf-9efe-71aee41c21c3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "--- Getting binary phenotype outcomes for '1237-VO-BJ-DJOGBENOU-VMF00050' ---\n",
+      "Shape of binary series: (88,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18949-5562STDY7801785    0.0\n",
+      "VBS18950-5562STDY7801786    0.0\n",
+      "VBS18951-5562STDY7801787    0.0\n",
+      "VBS18952-5562STDY7801788    0.0\n",
+      "VBS18953-5562STDY7801789    0.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [0. 1.]\n",
+      "\n",
+      "Shape of binary series (alive Deltamethrin): (48,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18995-5562STDY7801828    1.0\n",
+      "VBS18996-5562STDY7801829    1.0\n",
+      "VBS18998-5562STDY7801830    1.0\n",
+      "VBS18999-5562STDY7801831    1.0\n",
+      "VBS19000-5562STDY7801832    1.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [1.]\n",
+      "\n",
+      "Shape of binary series (dose 0.5): (40,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18949-5562STDY7801785    0.0\n",
+      "VBS18950-5562STDY7801786    0.0\n",
+      "VBS18951-5562STDY7801787    0.0\n",
+      "VBS18952-5562STDY7801788    0.0\n",
+      "VBS18953-5562STDY7801789    0.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [0.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\n--- Getting binary phenotype outcomes for '{demo_sample_set}' ---\")\n",
+    "\n",
+    "# Example 1: Binary outcomes for all Deltamethrin samples\n",
+    "binary_deltamethrin = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"insecticide == 'Deltamethrin'\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_deltamethrin.head())\n",
+    "print(f\"Unique values in series: {binary_deltamethrin.unique()}\")\n",
+    "\n",
+    "\n",
+    "# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
+    "binary_alive_deltamethrin = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_alive_deltamethrin.head())\n",
+    "print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
+    "\n",
+    "# Example 3: Binary outcomes for samples with dose 0.5\n",
+    "binary_dose_0_5 = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"dose == 0.5\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_dose_0_5.head())\n",
+    "print(f\"Unique values in series: {binary_dose_0_5.unique()}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a92e430e-7f40-48a8-8c98-2405d27aabfb",
    "metadata": {},
    "source": [
-    "### 4. Loading Phenotype Data Merged with SNP Calls\n",
+    "## 4. Getting Binary Phenotype Outcomes with phenotype_binary\n",
+    "\n",
+    "The `phenotype_binary()` method provides a convenient way to get phenotype outcomes as a binary pandas Series (1 for alive/resistant, 0 for dead/susceptible, NaN for unmapped values). Like `phenotype_data()`, it accepts the `sample_query` parameter for filtering."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "fda88368",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "--- Getting binary phenotype outcomes for '1237-VO-BJ-DJOGBENOU-VMF00050' ---\n",
+      "Shape of binary series: (88,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18949-5562STDY7801785    0.0\n",
+      "VBS18950-5562STDY7801786    0.0\n",
+      "VBS18951-5562STDY7801787    0.0\n",
+      "VBS18952-5562STDY7801788    0.0\n",
+      "VBS18953-5562STDY7801789    0.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [0. 1.]\n",
+      "\n",
+      "Shape of binary series (alive Deltamethrin): (48,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18995-5562STDY7801828    1.0\n",
+      "VBS18996-5562STDY7801829    1.0\n",
+      "VBS18998-5562STDY7801830    1.0\n",
+      "VBS18999-5562STDY7801831    1.0\n",
+      "VBS19000-5562STDY7801832    1.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [1.]\n",
+      "\n",
+      "Shape of binary series (dose 0.5): (40,)\n",
+      "First 5 entries:\n",
+      "sample_id\n",
+      "VBS18949-5562STDY7801785    0.0\n",
+      "VBS18950-5562STDY7801786    0.0\n",
+      "VBS18951-5562STDY7801787    0.0\n",
+      "VBS18952-5562STDY7801788    0.0\n",
+      "VBS18953-5562STDY7801789    0.0\n",
+      "Name: phenotype_binary, dtype: float64\n",
+      "Unique values in series: [0.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\n--- Getting binary phenotype outcomes for '{demo_sample_set}' ---\")\n",
+    "\n",
+    "# Example 1: Binary outcomes for all Deltamethrin samples\n",
+    "binary_deltamethrin = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"insecticide == 'Deltamethrin'\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_deltamethrin.head())\n",
+    "print(f\"Unique values in series: {binary_deltamethrin.unique()}\")\n",
+    "\n",
+    "\n",
+    "# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
+    "binary_alive_deltamethrin = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_alive_deltamethrin.head())\n",
+    "print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
+    "\n",
+    "# Example 3: Binary outcomes for samples with dose 0.5\n",
+    "binary_dose_0_5 = ag3.phenotype_binary(\n",
+    "    sample_sets=[demo_sample_set],\n",
+    "    sample_query=\"dose == 0.5\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
+    "print(\"First 5 entries:\")\n",
+    "print(binary_dose_0_5.head())\n",
+    "print(f\"Unique values in series: {binary_dose_0_5.unique()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62e84ea0",
+   "metadata": {},
+   "source": [
+    "## 5. Loading Phenotype Data Merged with SNP Calls\n",
     "The phenotypes_with_snps() method returns an xarray.Dataset that combines phenotype data with SNP calls for a specified genomic region. The sample_query parameter is still used to filter the phenotype data before merging."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 8,
    "id": "726ab8a3-6573-49d5-9b29-479176e5dee6",
    "metadata": {},
    "outputs": [
@@ -880,13 +1060,13 @@
    "id": "99e7b505-4ea4-43dc-91ae-14ac8f12076e",
    "metadata": {},
    "source": [
-    "### 5. Loading Phenotype Data Merged with Haplotypes\n",
+    "## 6. Loading Phenotype Data Merged with Haplotypes\n",
     "Similarly, the phenotypes_with_haplotypes() method returns an xarray.Dataset combining phenotype data with haplotype calls for a given region."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "id": "403f3a26-71eb-4781-975d-d94b7a021d97",
    "metadata": {},
    "outputs": [
diff --git a/tests/integration/test_ag3.py b/tests/integration/test_ag3.py