Skip to content

Commit dffbec5

Browse files
Fix: Restore phenotype_binary function and update tests/notebook
- Re-implemented the phenotype_binary method, which was inadvertently removed during previous refactoring. - Updated phenotype_binary to leverage the sample_query mechanism for filtering, aligning with mentor feedback. - Added new integration tests for phenotype_binary functionality. - Incorporated phenotype_binary examples into the demonstration notebook.
1 parent dd9146b commit dffbec5

3 files changed

Lines changed: 310 additions & 11 deletions

File tree

malariagen_data/anoph/phenotypes.py

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Callable, Optional, List, Any
44
import warnings
55
import fsspec
6-
from malariagen_data.anoph import base_params
6+
from malariagen_data.anoph import base_params, phenotype_params
77

88

99
class AnophelesPhenotypeData:
@@ -516,3 +516,73 @@ def phenotype_sample_sets(self) -> List[str]:
516516
continue
517517

518518
return phenotype_sample_sets
519+
520+
def phenotype_binary(
    self,
    sample_sets: Optional[base_params.sample_sets] = None,
    insecticide: Optional[phenotype_params.insecticide] = None,
    dose: Optional[phenotype_params.dose] = None,
    phenotype: Optional[phenotype_params.phenotype] = None,
    sample_query: Optional[base_params.sample_query] = None,
    sample_query_options: Optional[base_params.sample_query_options] = None,
    cohort_size: Optional[base_params.cohort_size] = None,
    min_cohort_size: Optional[base_params.min_cohort_size] = None,
    max_cohort_size: Optional[base_params.max_cohort_size] = None,
) -> pd.Series:
    """
    Load phenotypic data as binary outcomes (1=alive/resistant,
    0=dead/susceptible, NaN=unknown).

    The ``insecticide``, ``dose`` and ``phenotype`` arguments are translated
    into a query expression, AND-ed with any user-supplied ``sample_query``,
    and passed to ``phenotype_data()``. The resulting DataFrame is converted
    by ``_create_phenotype_binary_series()``.

    Parameters
    ----------
    sample_sets
        Forwarded unchanged to ``phenotype_data()``.
    insecticide, dose, phenotype
        Optional filter values. A single value produces an equality test;
        a list, tuple or set produces an ``in`` membership test.
    sample_query
        Optional extra query expression; combined with the generated
        filters as ``(sample_query) and (generated filters)``.
    sample_query_options, cohort_size, min_cohort_size, max_cohort_size
        Forwarded unchanged to ``phenotype_data()``.

    Returns
    -------
    pd.Series
        Series named ``"phenotype_binary"``. When the data contains a
        ``sample_id`` column the Series is indexed by it. An empty float
        Series (with an empty ``sample_id`` index) is returned when no rows
        match.

    Warns
    -----
    UserWarning
        If the DataFrame returned by ``phenotype_data()`` has no
        ``sample_id`` column, in which case the index is left untouched.
    """

    def _clause(column, value, quote):
        # Build one query clause for `column`.
        if isinstance(value, (list, tuple, set, frozenset)):
            # Normalise every collection to a list literal so tuples and
            # sets produce a valid `in [...]` test, exactly as lists do.
            return f"{column} in {list(value)!r}"
        if quote:
            # repr() picks a quote style that survives embedded quotes,
            # so a value such as "O'Neil" cannot break the expression.
            return f"{column} == {str(value)!r}"
        return f"{column} == {value}"

    # (column, value, quote-scalar-as-string) for each optional filter.
    filters = (
        ("insecticide", insecticide, True),
        ("dose", dose, False),
        ("phenotype", phenotype, True),
    )
    query_parts = [
        _clause(column, value, quote)
        for column, value, quote in filters
        if value is not None
    ]

    # Combine with an existing sample_query if provided.
    final_sample_query = sample_query
    if query_parts:
        generated_query = " and ".join(query_parts)
        if final_sample_query:
            final_sample_query = f"({final_sample_query}) and ({generated_query})"
        else:
            final_sample_query = generated_query

    df = self.phenotype_data(
        sample_sets=sample_sets,
        sample_query=final_sample_query,
        sample_query_options=sample_query_options,
        cohort_size=cohort_size,
        min_cohort_size=min_cohort_size,
        max_cohort_size=max_cohort_size,
    )

    if df.empty:
        # Keep the index name consistent with the non-empty result so
        # callers can rely on `index.name == "sample_id"` either way.
        return pd.Series(
            dtype=float,
            name="phenotype_binary",
            index=pd.Index([], name="sample_id"),
        )

    binary_series = self._create_phenotype_binary_series(df)
    binary_series.name = "phenotype_binary"

    # Ensure the index is correctly set to sample_id.
    if "sample_id" in df.columns:
        binary_series.index = pd.Index(df["sample_id"])
    else:
        warnings.warn(
            "Cannot set index to sample_id as it is missing from the DataFrame returned by phenotype_data.",
            stacklevel=2,  # attribute the warning to the caller's line
        )

    return binary_series

notebooks/phenotype_data_demo.ipynb

Lines changed: 190 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
},
2424
{
2525
"cell_type": "code",
26-
"execution_count": 3,
26+
"execution_count": 1,
2727
"id": "f1280de7-129f-4875-bf12-eb68b8d88573",
2828
"metadata": {},
2929
"outputs": [
@@ -396,7 +396,7 @@
396396
},
397397
{
398398
"cell_type": "code",
399-
"execution_count": 4,
399+
"execution_count": 2,
400400
"id": "7dd3ebc4-64f7-4d0b-ad01-fbc1e08b0636",
401401
"metadata": {},
402402
"outputs": [
@@ -443,7 +443,7 @@
443443
},
444444
{
445445
"cell_type": "code",
446-
"execution_count": 11,
446+
"execution_count": 3,
447447
"id": "c0b9d0ae-6a79-458a-ad9f-9281a1c6112e",
448448
"metadata": {},
449449
"outputs": [
@@ -453,7 +453,7 @@
453453
"text": [
454454
"\n",
455455
"--- Loading phenotype data for '1237-VO-BJ-DJOGBENOU-VMF00050' filtered by Deltamethrin ---\n",
456-
"Shape of DataFrame: (88, 60)\n",
456+
"Shape of DataFrame: (88, 60) \n",
457457
"\n",
458458
"First 5 rows of the filtered DataFrame:\n",
459459
"\n",
@@ -555,7 +555,7 @@
555555
},
556556
{
557557
"cell_type": "code",
558-
"execution_count": 12,
558+
"execution_count": 4,
559559
"id": "eab7d81c-d80c-4279-9263-a0d7b425ff85",
560560
"metadata": {},
561561
"outputs": [
@@ -671,7 +671,7 @@
671671
},
672672
{
673673
"cell_type": "code",
674-
"execution_count": 13,
674+
"execution_count": 5,
675675
"id": "de8cd777-a4b2-44dd-a0bf-64a6ef419da7",
676676
"metadata": {},
677677
"outputs": [
@@ -780,18 +780,198 @@
780780
"# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")\n"
781781
]
782782
},
783+
{
784+
"cell_type": "code",
785+
"execution_count": 6,
786+
"id": "9a1f4cc9-528f-40bf-9efe-71aee41c21c3",
787+
"metadata": {},
788+
"outputs": [
789+
{
790+
"name": "stdout",
791+
"output_type": "stream",
792+
"text": [
793+
"\n",
794+
"--- Getting binary phenotype outcomes for '1237-VO-BJ-DJOGBENOU-VMF00050' ---\n",
795+
"Shape of binary series: (88,)\n",
796+
"First 5 entries:\n",
797+
"sample_id\n",
798+
"VBS18949-5562STDY7801785 0.0\n",
799+
"VBS18950-5562STDY7801786 0.0\n",
800+
"VBS18951-5562STDY7801787 0.0\n",
801+
"VBS18952-5562STDY7801788 0.0\n",
802+
"VBS18953-5562STDY7801789 0.0\n",
803+
"Name: phenotype_binary, dtype: float64\n",
804+
"Unique values in series: [0. 1.]\n",
805+
"\n",
806+
"Shape of binary series (alive Deltamethrin): (48,)\n",
807+
"First 5 entries:\n",
808+
"sample_id\n",
809+
"VBS18995-5562STDY7801828 1.0\n",
810+
"VBS18996-5562STDY7801829 1.0\n",
811+
"VBS18998-5562STDY7801830 1.0\n",
812+
"VBS18999-5562STDY7801831 1.0\n",
813+
"VBS19000-5562STDY7801832 1.0\n",
814+
"Name: phenotype_binary, dtype: float64\n",
815+
"Unique values in series: [1.]\n",
816+
"\n",
817+
"Shape of binary series (dose 0.5): (40,)\n",
818+
"First 5 entries:\n",
819+
"sample_id\n",
820+
"VBS18949-5562STDY7801785 0.0\n",
821+
"VBS18950-5562STDY7801786 0.0\n",
822+
"VBS18951-5562STDY7801787 0.0\n",
823+
"VBS18952-5562STDY7801788 0.0\n",
824+
"VBS18953-5562STDY7801789 0.0\n",
825+
"Name: phenotype_binary, dtype: float64\n",
826+
"Unique values in series: [0.]\n"
827+
]
828+
}
829+
],
830+
"source": [
831+
"print(f\"\\n--- Getting binary phenotype outcomes for '{demo_sample_set}' ---\")\n",
832+
"\n",
833+
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
834+
"binary_deltamethrin = ag3.phenotype_binary(\n",
835+
" sample_sets=[demo_sample_set],\n",
836+
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
837+
")\n",
838+
"\n",
839+
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
840+
"print(\"First 5 entries:\")\n",
841+
"print(binary_deltamethrin.head())\n",
842+
"print(f\"Unique values in series: {binary_deltamethrin.unique()}\")\n",
843+
"\n",
844+
"\n",
845+
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
846+
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
847+
" sample_sets=[demo_sample_set],\n",
848+
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
849+
")\n",
850+
"\n",
851+
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
852+
"print(\"First 5 entries:\")\n",
853+
"print(binary_alive_deltamethrin.head())\n",
854+
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
855+
"\n",
856+
"# Example 3: Binary outcomes for samples with dose 0.5\n",
857+
"binary_dose_0_5 = ag3.phenotype_binary(\n",
858+
" sample_sets=[demo_sample_set],\n",
859+
" sample_query=\"dose == 0.5\"\n",
860+
")\n",
861+
"\n",
862+
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
863+
"print(\"First 5 entries:\")\n",
864+
"print(binary_dose_0_5.head())\n",
865+
"print(f\"Unique values in series: {binary_dose_0_5.unique()}\")"
866+
]
867+
},
783868
{
784869
"cell_type": "markdown",
785870
"id": "a92e430e-7f40-48a8-8c98-2405d27aabfb",
786871
"metadata": {},
787872
"source": [
788-
"### 4. Loading Phenotype Data Merged with SNP Calls\n",
873+
"## 4. Getting Binary Phenotype Outcomes with phenotype_binary\n",
874+
"\n",
875+
"The `phenotype_binary()` method returns phenotype outcomes as a binary pandas Series indexed by `sample_id` (1 for alive/resistant, 0 for dead/susceptible, NaN for phenotypes that cannot be mapped). Like `phenotype_data()`, it accepts a `sample_query` for filtering."
876+
]
877+
},
878+
{
879+
"cell_type": "code",
880+
"execution_count": 7,
881+
"id": "fda88368",
882+
"metadata": {},
883+
"outputs": [
884+
{
885+
"name": "stdout",
886+
"output_type": "stream",
887+
"text": [
888+
"\n",
889+
"--- Getting binary phenotype outcomes for '1237-VO-BJ-DJOGBENOU-VMF00050' ---\n",
890+
"Shape of binary series: (88,)\n",
891+
"First 5 entries:\n",
892+
"sample_id\n",
893+
"VBS18949-5562STDY7801785 0.0\n",
894+
"VBS18950-5562STDY7801786 0.0\n",
895+
"VBS18951-5562STDY7801787 0.0\n",
896+
"VBS18952-5562STDY7801788 0.0\n",
897+
"VBS18953-5562STDY7801789 0.0\n",
898+
"Name: phenotype_binary, dtype: float64\n",
899+
"Unique values in series: [0. 1.]\n",
900+
"\n",
901+
"Shape of binary series (alive Deltamethrin): (48,)\n",
902+
"First 5 entries:\n",
903+
"sample_id\n",
904+
"VBS18995-5562STDY7801828 1.0\n",
905+
"VBS18996-5562STDY7801829 1.0\n",
906+
"VBS18998-5562STDY7801830 1.0\n",
907+
"VBS18999-5562STDY7801831 1.0\n",
908+
"VBS19000-5562STDY7801832 1.0\n",
909+
"Name: phenotype_binary, dtype: float64\n",
910+
"Unique values in series: [1.]\n",
911+
"\n",
912+
"Shape of binary series (dose 0.5): (40,)\n",
913+
"First 5 entries:\n",
914+
"sample_id\n",
915+
"VBS18949-5562STDY7801785 0.0\n",
916+
"VBS18950-5562STDY7801786 0.0\n",
917+
"VBS18951-5562STDY7801787 0.0\n",
918+
"VBS18952-5562STDY7801788 0.0\n",
919+
"VBS18953-5562STDY7801789 0.0\n",
920+
"Name: phenotype_binary, dtype: float64\n",
921+
"Unique values in series: [0.]\n"
922+
]
923+
}
924+
],
925+
"source": [
926+
"print(f\"\\n--- Getting binary phenotype outcomes for '{demo_sample_set}' ---\")\n",
927+
"\n",
928+
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
929+
"binary_deltamethrin = ag3.phenotype_binary(\n",
930+
" sample_sets=[demo_sample_set],\n",
931+
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
932+
")\n",
933+
"\n",
934+
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
935+
"print(\"First 5 entries:\")\n",
936+
"print(binary_deltamethrin.head())\n",
937+
"print(f\"Unique values in series: {binary_deltamethrin.unique()}\")\n",
938+
"\n",
939+
"\n",
940+
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
941+
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
942+
" sample_sets=[demo_sample_set],\n",
943+
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
944+
")\n",
945+
"\n",
946+
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
947+
"print(\"First 5 entries:\")\n",
948+
"print(binary_alive_deltamethrin.head())\n",
949+
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
950+
"\n",
951+
"# Example 3: Binary outcomes for samples with dose 0.5\n",
952+
"binary_dose_0_5 = ag3.phenotype_binary(\n",
953+
" sample_sets=[demo_sample_set],\n",
954+
" sample_query=\"dose == 0.5\"\n",
955+
")\n",
956+
"\n",
957+
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
958+
"print(\"First 5 entries:\")\n",
959+
"print(binary_dose_0_5.head())\n",
960+
"print(f\"Unique values in series: {binary_dose_0_5.unique()}\")"
961+
]
962+
},
963+
{
964+
"cell_type": "markdown",
965+
"id": "62e84ea0",
966+
"metadata": {},
967+
"source": [
968+
"## 5. Loading Phenotype Data Merged with SNP Calls\n",
789969
"The phenotypes_with_snps() method returns an xarray.Dataset that combines phenotype data with SNP calls for a specified genomic region. The sample_query parameter is still used to filter the phenotype data before merging."
790970
]
791971
},
792972
{
793973
"cell_type": "code",
794-
"execution_count": 14,
974+
"execution_count": 8,
795975
"id": "726ab8a3-6573-49d5-9b29-479176e5dee6",
796976
"metadata": {},
797977
"outputs": [
@@ -880,13 +1060,13 @@
8801060
"id": "99e7b505-4ea4-43dc-91ae-14ac8f12076e",
8811061
"metadata": {},
8821062
"source": [
883-
"### 5. Loading Phenotype Data Merged with Haplotypes\n",
1063+
"## 6. Loading Phenotype Data Merged with Haplotypes\n",
8841064
"Similarly, the phenotypes_with_haplotypes() method returns an xarray.Dataset combining phenotype data with haplotype calls for a given region."
8851065
]
8861066
},
8871067
{
8881068
"cell_type": "code",
889-
"execution_count": 10,
1069+
"execution_count": 9,
8901070
"id": "403f3a26-71eb-4781-975d-d94b7a021d97",
8911071
"metadata": {},
8921072
"outputs": [

0 commit comments

Comments
 (0)