Skip to content

Commit b160cc2

Browse files
authored
Merge pull request #869 from Aryan-SINGH-GIT/informative-valueError
Improve plot_njt errors for insufficient data
2 parents c8344ea + 9799bb0 commit b160cc2

22 files changed

+239
-136
lines changed

malariagen_data/anoph/distance.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,20 @@ def plot_njt(
527527
count_sort = True
528528
distance_sort = False
529529

530+
# Ensure we have enough samples for a tree.
531+
# With 0 samples, `biallelic_snp_calls` or `snp_calls` should already have raised "No samples found".
532+
# With exactly 1 sample, however, execution would reach this point and then fail inside `njt`.
533+
df_samples = self.sample_metadata(
534+
sample_sets=sample_sets,
535+
sample_query=sample_query,
536+
sample_query_options=sample_query_options,
537+
sample_indices=sample_indices,
538+
)
539+
if 0 < len(df_samples) < 2:
540+
raise ValueError(
541+
f"Not enough samples for neighbour-joining tree. Found {len(df_samples)}, needed at least 2."
542+
)
543+
530544
# Compute neighbour-joining tree.
531545
Z, samples, n_snps_used = self.njt(
532546
region=region,

malariagen_data/anoph/karyotype_params.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Parameter definitions for karyotype analysis functions."""
22

3-
43
from typing_extensions import Annotated, TypeAlias
54

65
inversion_param: TypeAlias = Annotated[

malariagen_data/mjn.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ def _mjn_graph_edges(
264264

265265
# add further intermediate nodes as necessary
266266
for k in range(1, sep - 1):
267-
source = f"anon_{i}_{j}_{k-1}"
267+
source = f"anon_{i}_{j}_{k - 1}"
268268
target = f"anon_{i}_{j}_{k}"
269269
graph_node = {
270270
"id": target,
@@ -280,10 +280,10 @@ def _mjn_graph_edges(
280280
graph_edges.append(graph_edge)
281281

282282
# add edge from final intermediate node to node j
283-
source = f"anon_{i}_{j}_{sep-2}"
283+
source = f"anon_{i}_{j}_{sep - 2}"
284284
target = j
285285
graph_edge = {
286-
"id": f"edge_{i}_{j}_{sep-1}",
286+
"id": f"edge_{i}_{j}_{sep - 1}",
287287
"source": source,
288288
"target": target,
289289
}

notebooks/auto_chunks.ipynb

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,7 @@
3939
"metadata": {},
4040
"outputs": [],
4141
"source": [
42-
"ds = ag3.snp_calls(\n",
43-
" region=\"3R\", sample_sets=\"AG1000G-BF-A\"\n",
44-
")"
42+
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\")"
4543
]
4644
},
4745
{
@@ -80,7 +78,9 @@
8078
"outputs": [],
8179
"source": [
8280
"ds = ag3.snp_calls(\n",
83-
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"300MB\",\n",
81+
" region=\"3R\",\n",
82+
" sample_sets=\"AG1000G-BF-A\",\n",
83+
" chunks=\"300MB\",\n",
8484
")"
8585
]
8686
},
@@ -119,9 +119,7 @@
119119
"metadata": {},
120120
"outputs": [],
121121
"source": [
122-
"ds = ag3.snp_calls(\n",
123-
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\"\n",
124-
")"
122+
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\")"
125123
]
126124
},
127125
{
@@ -159,9 +157,7 @@
159157
"metadata": {},
160158
"outputs": [],
161159
"source": [
162-
"ds = ag3.snp_calls(\n",
163-
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\"\n",
164-
")"
160+
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\")"
165161
]
166162
},
167163
{

notebooks/extra_metadata.ipynb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,9 @@
112112
"metadata": {},
113113
"outputs": [],
114114
"source": [
115-
"ag3.haplotypes(region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\")"
115+
"ag3.haplotypes(\n",
116+
" region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\"\n",
117+
")"
116118
]
117119
},
118120
{

notebooks/karyotype.ipynb

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,9 @@
120120
},
121121
"outputs": [],
122122
"source": [
123-
"ag3.plot_pca_coords(pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500)"
123+
"ag3.plot_pca_coords(\n",
124+
" pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500\n",
125+
")"
124126
]
125127
},
126128
{
@@ -195,7 +197,9 @@
195197
},
196198
"outputs": [],
197199
"source": [
198-
"ag3.plot_pca_coords(pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500)"
200+
"ag3.plot_pca_coords(\n",
201+
" pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500\n",
202+
")"
199203
]
200204
},
201205
{
@@ -228,7 +232,7 @@
228232
"outputs": [],
229233
"source": [
230234
"kt_df_2rc_gam = ag3.karyotype(\n",
231-
" inversion=\"2Rc_gam\", \n",
235+
" inversion=\"2Rc_gam\",\n",
232236
" sample_sets=sample_sets,\n",
233237
" sample_query=\"taxon == 'gambiae'\",\n",
234238
")\n",
@@ -275,7 +279,9 @@
275279
"metadata": {},
276280
"outputs": [],
277281
"source": [
278-
"ag3.plot_pca_coords(pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500)"
282+
"ag3.plot_pca_coords(\n",
283+
" pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500\n",
284+
")"
279285
]
280286
},
281287
{
@@ -287,7 +293,9 @@
287293
},
288294
"outputs": [],
289295
"source": [
290-
"ag3.plot_pca_coords(pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500)"
296+
"ag3.plot_pca_coords(\n",
297+
" pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500\n",
298+
")"
291299
]
292300
},
293301
{
@@ -308,7 +316,7 @@
308316
"outputs": [],
309317
"source": [
310318
"kt_df_2rc_col = ag3.karyotype(\n",
311-
" inversion=\"2Rc_col\", \n",
319+
" inversion=\"2Rc_col\",\n",
312320
" sample_sets=sample_sets,\n",
313321
" sample_query=\"taxon == 'coluzzii'\",\n",
314322
")\n",
@@ -357,7 +365,9 @@
357365
},
358366
"outputs": [],
359367
"source": [
360-
"ag3.plot_pca_coords(pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500)"
368+
"ag3.plot_pca_coords(\n",
369+
" pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500\n",
370+
")"
361371
]
362372
}
363373
],

notebooks/local_cluster.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
"outputs": [],
4545
"source": [
4646
"from distributed import LocalCluster, Client\n",
47+
"\n",
4748
"cluster = LocalCluster()\n",
4849
"cluster"
4950
]

notebooks/phenotype_data_demo.ipynb

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@
382382
"ag3 = Ag3(pre=True)\n",
383383
"\n",
384384
"print(\"MalariaGEN Ag3 API client initialized.\")\n",
385-
"print(ag3)\n"
385+
"print(ag3)"
386386
]
387387
},
388388
{
@@ -416,12 +416,14 @@
416416
"\n",
417417
"# We'll pick one sample set for demonstration, preferably one known to have data\n",
418418
"# For this example, we'll use '1237-VO-BJ-DJOGBENOU-VMF00050'\n",
419-
"demo_sample_set = '1237-VO-BJ-DJOGBENOU-VMF00050'\n",
419+
"demo_sample_set = \"1237-VO-BJ-DJOGBENOU-VMF00050\"\n",
420420
"if demo_sample_set not in phenotype_sample_sets:\n",
421-
" print(f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\")\n",
421+
" print(\n",
422+
" f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\"\n",
423+
" )\n",
422424
" demo_sample_set = phenotype_sample_sets\n",
423425
"\n",
424-
"print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")\n"
426+
"print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")"
425427
]
426428
},
427429
{
@@ -531,18 +533,21 @@
531533
}
532534
],
533535
"source": [
534-
"print(f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\")\n",
536+
"print(\n",
537+
" f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\"\n",
538+
")\n",
535539
"df_deltamethrin = ag3.phenotype_data(\n",
536-
" sample_sets=[demo_sample_set],\n",
537-
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
540+
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
538541
")\n",
539542
"\n",
540543
"print(f\"Shape of DataFrame: {df_deltamethrin.shape}\")\n",
541544
"print(\"\\nFirst 5 rows of the filtered DataFrame:\")\n",
542545
"df_deltamethrin.head()\n",
543-
"print(f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\")\n",
546+
"print(\n",
547+
" f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\"\n",
548+
")\n",
544549
"print(\"\\nDataFrame Info:\")\n",
545-
"df_deltamethrin.info() "
550+
"df_deltamethrin.info()"
546551
]
547552
},
548553
{
@@ -647,7 +652,7 @@
647652
"print(f\"\\n--- Loading phenotype data filtered by Deltamethrin and dose >= 1.0 ---\")\n",
648653
"df_filtered_multi = ag3.phenotype_data(\n",
649654
" sample_sets=[demo_sample_set],\n",
650-
" sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\"\n",
655+
" sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\",\n",
651656
")\n",
652657
"\n",
653658
"print(f\"Shape of DataFrame: {df_filtered_multi.shape}\")\n",
@@ -657,7 +662,7 @@
657662
"print(f\"\\nUnique insecticides: {df_filtered_multi['insecticide'].unique()}\")\n",
658663
"print(f\"Unique doses: {df_filtered_multi['dose'].unique()}\")\n",
659664
"print(\"\\nDataFrame Info:\")\n",
660-
"df_filtered_multi.info()\n"
665+
"df_filtered_multi.info()"
661666
]
662667
},
663668
{
@@ -761,14 +766,14 @@
761766
"df_cohort_filtered = ag3.phenotype_data(\n",
762767
" sample_sets=[demo_sample_set],\n",
763768
" sample_query=\"insecticide == 'Deltamethrin'\",\n",
764-
" min_cohort_size=10\n",
769+
" min_cohort_size=10,\n",
765770
")\n",
766771
"\n",
767772
"print(f\"Shape of DataFrame: {df_cohort_filtered.shape}\")\n",
768773
"print(\"\\nFirst 5 rows of the cohort-filtered DataFrame:\")\n",
769-
"df_cohort_filtered.head() \n",
774+
"df_cohort_filtered.head()\n",
770775
"print(\"\\nDataFrame Info:\")\n",
771-
"df_cohort_filtered.info() \n",
776+
"df_cohort_filtered.info()\n",
772777
"# Verify cohort sizes (optional, for internal testing)\n",
773778
"# if not df_cohort_filtered.empty:\n",
774779
"# cohort_keys = [\"insecticide\", \"dose\", \"location\", \"country\", \"sample_set\"]\n",
@@ -777,7 +782,7 @@
777782
"# cohort_sizes = df_cohort_filtered.groupby(available_keys).size()\n",
778783
"# print(\"\\nCohort sizes after filtering:\")\n",
779784
"# print(cohort_sizes)\n",
780-
"# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")\n"
785+
"# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")"
781786
]
782787
},
783788
{
@@ -832,8 +837,7 @@
832837
"\n",
833838
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
834839
"binary_deltamethrin = ag3.phenotype_binary(\n",
835-
" sample_sets=[demo_sample_set],\n",
836-
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
840+
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
837841
")\n",
838842
"\n",
839843
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
@@ -845,18 +849,19 @@
845849
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
846850
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
847851
" sample_sets=[demo_sample_set],\n",
848-
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
852+
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
849853
")\n",
850854
"\n",
851-
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
855+
"print(\n",
856+
" f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n",
857+
")\n",
852858
"print(\"First 5 entries:\")\n",
853859
"print(binary_alive_deltamethrin.head())\n",
854860
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
855861
"\n",
856862
"# Example 3: Binary outcomes for samples with dose 0.5\n",
857863
"binary_dose_0_5 = ag3.phenotype_binary(\n",
858-
" sample_sets=[demo_sample_set],\n",
859-
" sample_query=\"dose == 0.5\"\n",
864+
" sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n",
860865
")\n",
861866
"\n",
862867
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
@@ -927,8 +932,7 @@
927932
"\n",
928933
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
929934
"binary_deltamethrin = ag3.phenotype_binary(\n",
930-
" sample_sets=[demo_sample_set],\n",
931-
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
935+
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
932936
")\n",
933937
"\n",
934938
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
@@ -940,18 +944,19 @@
940944
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
941945
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
942946
" sample_sets=[demo_sample_set],\n",
943-
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
947+
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
944948
")\n",
945949
"\n",
946-
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
950+
"print(\n",
951+
" f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n",
952+
")\n",
947953
"print(\"First 5 entries:\")\n",
948954
"print(binary_alive_deltamethrin.head())\n",
949955
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
950956
"\n",
951957
"# Example 3: Binary outcomes for samples with dose 0.5\n",
952958
"binary_dose_0_5 = ag3.phenotype_binary(\n",
953-
" sample_sets=[demo_sample_set],\n",
954-
" sample_query=\"dose == 0.5\"\n",
959+
" sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n",
955960
")\n",
956961
"\n",
957962
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
@@ -1038,7 +1043,7 @@
10381043
"ds_snps = ag3.phenotypes_with_snps(\n",
10391044
" sample_sets=[demo_sample_set],\n",
10401045
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
1041-
" region=demo_region_snps\n",
1046+
" region=demo_region_snps,\n",
10421047
")\n",
10431048
"\n",
10441049
"print(f\"Dataset dimensions: {ds_snps.dims}\")\n",
@@ -1052,7 +1057,7 @@
10521057
"print(\"\\nFirst 5 variant positions:\")\n",
10531058
"print(ds_snps[\"variant_position\"].head(5).values)\n",
10541059
"print(\"\\nDataset Info:\")\n",
1055-
"ds_snps.info()\n"
1060+
"ds_snps.info()"
10561061
]
10571062
},
10581063
{
@@ -1126,7 +1131,7 @@
11261131
"ds_haps = ag3.phenotypes_with_haplotypes(\n",
11271132
" sample_sets=[demo_sample_set],\n",
11281133
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'dead'\",\n",
1129-
" region=demo_region_haps\n",
1134+
" region=demo_region_haps,\n",
11301135
")\n",
11311136
"\n",
11321137
"print(f\"Dataset dimensions: {ds_haps.dims}\")\n",
@@ -1140,7 +1145,7 @@
11401145
"print(\"\\nFirst 5 variant positions:\")\n",
11411146
"print(ds_haps[\"variant_position\"].head(5).values)\n",
11421147
"print(\"\\nDataset Info:\")\n",
1143-
"ds_haps.info()\n"
1148+
"ds_haps.info()"
11441149
]
11451150
},
11461151
{

0 commit comments

Comments
 (0)