Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions malariagen_data/anoph/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,20 @@ def plot_njt(
count_sort = True
distance_sort = False

# Ensure we have enough samples for a tree.
# If we have 0 samples, `biallelic_snp_calls` or `snp_calls` should have already raised "No samples found".
# However, if we have 1 sample, it might pass through until here, where it would cause a failure in njt.
df_samples = self.sample_metadata(
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
sample_indices=sample_indices,
)
if 0 < len(df_samples) < 2:
raise ValueError(
f"Not enough samples for neighbour-joining tree. Found {len(df_samples)}, needed at least 2."
)

# Compute neighbour-joining tree.
Z, samples, n_snps_used = self.njt(
region=region,
Expand Down
1 change: 0 additions & 1 deletion malariagen_data/anoph/karyotype_params.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Parameter definitions for karyotype analysis functions."""


from typing_extensions import Annotated, TypeAlias

inversion_param: TypeAlias = Annotated[
Expand Down
6 changes: 3 additions & 3 deletions malariagen_data/mjn.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def _mjn_graph_edges(

# add further intermediate nodes as necessary
for k in range(1, sep - 1):
source = f"anon_{i}_{j}_{k-1}"
source = f"anon_{i}_{j}_{k - 1}"
target = f"anon_{i}_{j}_{k}"
graph_node = {
"id": target,
Expand All @@ -280,10 +280,10 @@ def _mjn_graph_edges(
graph_edges.append(graph_edge)

# add edge from final intermediate node to node j
source = f"anon_{i}_{j}_{sep-2}"
source = f"anon_{i}_{j}_{sep - 2}"
target = j
graph_edge = {
"id": f"edge_{i}_{j}_{sep-1}",
"id": f"edge_{i}_{j}_{sep - 1}",
"source": source,
"target": target,
}
Expand Down
16 changes: 6 additions & 10 deletions notebooks/auto_chunks.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds = ag3.snp_calls(\n",
" region=\"3R\", sample_sets=\"AG1000G-BF-A\"\n",
")"
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\")"
]
},
{
Expand Down Expand Up @@ -80,7 +78,9 @@
"outputs": [],
"source": [
"ds = ag3.snp_calls(\n",
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"300MB\",\n",
" region=\"3R\",\n",
" sample_sets=\"AG1000G-BF-A\",\n",
" chunks=\"300MB\",\n",
")"
]
},
Expand Down Expand Up @@ -119,9 +119,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds = ag3.snp_calls(\n",
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\"\n",
")"
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\")"
]
},
{
Expand Down Expand Up @@ -159,9 +157,7 @@
"metadata": {},
"outputs": [],
"source": [
"ds = ag3.snp_calls(\n",
" region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\"\n",
")"
"ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\")"
]
},
{
Expand Down
4 changes: 3 additions & 1 deletion notebooks/extra_metadata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@
"metadata": {},
"outputs": [],
"source": [
"ag3.haplotypes(region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\")"
"ag3.haplotypes(\n",
" region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\"\n",
")"
]
},
{
Expand Down
24 changes: 17 additions & 7 deletions notebooks/karyotype.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@
},
"outputs": [],
"source": [
"ag3.plot_pca_coords(pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500)"
"ag3.plot_pca_coords(\n",
" pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500\n",
")"
]
},
{
Expand Down Expand Up @@ -195,7 +197,9 @@
},
"outputs": [],
"source": [
"ag3.plot_pca_coords(pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500)"
"ag3.plot_pca_coords(\n",
" pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500\n",
")"
]
},
{
Expand Down Expand Up @@ -228,7 +232,7 @@
"outputs": [],
"source": [
"kt_df_2rc_gam = ag3.karyotype(\n",
" inversion=\"2Rc_gam\", \n",
" inversion=\"2Rc_gam\",\n",
" sample_sets=sample_sets,\n",
" sample_query=\"taxon == 'gambiae'\",\n",
")\n",
Expand Down Expand Up @@ -275,7 +279,9 @@
"metadata": {},
"outputs": [],
"source": [
"ag3.plot_pca_coords(pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500)"
"ag3.plot_pca_coords(\n",
" pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500\n",
")"
]
},
{
Expand All @@ -287,7 +293,9 @@
},
"outputs": [],
"source": [
"ag3.plot_pca_coords(pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500)"
"ag3.plot_pca_coords(\n",
" pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500\n",
")"
]
},
{
Expand All @@ -308,7 +316,7 @@
"outputs": [],
"source": [
"kt_df_2rc_col = ag3.karyotype(\n",
" inversion=\"2Rc_col\", \n",
" inversion=\"2Rc_col\",\n",
" sample_sets=sample_sets,\n",
" sample_query=\"taxon == 'coluzzii'\",\n",
")\n",
Expand Down Expand Up @@ -357,7 +365,9 @@
},
"outputs": [],
"source": [
"ag3.plot_pca_coords(pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500)"
"ag3.plot_pca_coords(\n",
" pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500\n",
")"
]
}
],
Expand Down
1 change: 1 addition & 0 deletions notebooks/local_cluster.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"outputs": [],
"source": [
"from distributed import LocalCluster, Client\n",
"\n",
"cluster = LocalCluster()\n",
"cluster"
]
Expand Down
67 changes: 36 additions & 31 deletions notebooks/phenotype_data_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@
"ag3 = Ag3(pre=True)\n",
"\n",
"print(\"MalariaGEN Ag3 API client initialized.\")\n",
"print(ag3)\n"
"print(ag3)"
]
},
{
Expand Down Expand Up @@ -416,12 +416,14 @@
"\n",
"# We'll pick one sample set for demonstration, preferably one known to have data\n",
"# For this example, we'll use '1237-VO-BJ-DJOGBENOU-VMF00050'\n",
"demo_sample_set = '1237-VO-BJ-DJOGBENOU-VMF00050'\n",
"demo_sample_set = \"1237-VO-BJ-DJOGBENOU-VMF00050\"\n",
"if demo_sample_set not in phenotype_sample_sets:\n",
" print(f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\")\n",
" print(\n",
" f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\"\n",
" )\n",
" demo_sample_set = phenotype_sample_sets\n",
"\n",
"print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")\n"
"print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")"
]
},
{
Expand Down Expand Up @@ -531,18 +533,21 @@
}
],
"source": [
"print(f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\")\n",
"print(\n",
" f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\"\n",
")\n",
"df_deltamethrin = ag3.phenotype_data(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
")\n",
"\n",
"print(f\"Shape of DataFrame: {df_deltamethrin.shape}\")\n",
"print(\"\\nFirst 5 rows of the filtered DataFrame:\")\n",
"df_deltamethrin.head()\n",
"print(f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\")\n",
"print(\n",
" f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\"\n",
")\n",
"print(\"\\nDataFrame Info:\")\n",
"df_deltamethrin.info() "
"df_deltamethrin.info()"
]
},
{
Expand Down Expand Up @@ -647,7 +652,7 @@
"print(f\"\\n--- Loading phenotype data filtered by Deltamethrin and dose >= 1.0 ---\")\n",
"df_filtered_multi = ag3.phenotype_data(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\"\n",
" sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\",\n",
")\n",
"\n",
"print(f\"Shape of DataFrame: {df_filtered_multi.shape}\")\n",
Expand All @@ -657,7 +662,7 @@
"print(f\"\\nUnique insecticides: {df_filtered_multi['insecticide'].unique()}\")\n",
"print(f\"Unique doses: {df_filtered_multi['dose'].unique()}\")\n",
"print(\"\\nDataFrame Info:\")\n",
"df_filtered_multi.info()\n"
"df_filtered_multi.info()"
]
},
{
Expand Down Expand Up @@ -761,14 +766,14 @@
"df_cohort_filtered = ag3.phenotype_data(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin'\",\n",
" min_cohort_size=10\n",
" min_cohort_size=10,\n",
")\n",
"\n",
"print(f\"Shape of DataFrame: {df_cohort_filtered.shape}\")\n",
"print(\"\\nFirst 5 rows of the cohort-filtered DataFrame:\")\n",
"df_cohort_filtered.head() \n",
"df_cohort_filtered.head()\n",
"print(\"\\nDataFrame Info:\")\n",
"df_cohort_filtered.info() \n",
"df_cohort_filtered.info()\n",
"# Verify cohort sizes (optional, for internal testing)\n",
"# if not df_cohort_filtered.empty:\n",
"# cohort_keys = [\"insecticide\", \"dose\", \"location\", \"country\", \"sample_set\"]\n",
Expand All @@ -777,7 +782,7 @@
"# cohort_sizes = df_cohort_filtered.groupby(available_keys).size()\n",
"# print(\"\\nCohort sizes after filtering:\")\n",
"# print(cohort_sizes)\n",
"# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")\n"
"# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")"
]
},
{
Expand Down Expand Up @@ -832,8 +837,7 @@
"\n",
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
"binary_deltamethrin = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
")\n",
"\n",
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
Expand All @@ -845,18 +849,19 @@
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
")\n",
"\n",
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
"print(\n",
" f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n",
")\n",
"print(\"First 5 entries:\")\n",
"print(binary_alive_deltamethrin.head())\n",
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
"\n",
"# Example 3: Binary outcomes for samples with dose 0.5\n",
"binary_dose_0_5 = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"dose == 0.5\"\n",
" sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n",
")\n",
"\n",
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
Expand Down Expand Up @@ -927,8 +932,7 @@
"\n",
"# Example 1: Binary outcomes for all Deltamethrin samples\n",
"binary_deltamethrin = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin'\"\n",
" sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n",
")\n",
"\n",
"print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n",
Expand All @@ -940,18 +944,19 @@
"# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n",
"binary_alive_deltamethrin = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
")\n",
"\n",
"print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n",
"print(\n",
" f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n",
")\n",
"print(\"First 5 entries:\")\n",
"print(binary_alive_deltamethrin.head())\n",
"print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n",
"\n",
"# Example 3: Binary outcomes for samples with dose 0.5\n",
"binary_dose_0_5 = ag3.phenotype_binary(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"dose == 0.5\"\n",
" sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n",
")\n",
"\n",
"print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n",
Expand Down Expand Up @@ -1038,7 +1043,7 @@
"ds_snps = ag3.phenotypes_with_snps(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n",
" region=demo_region_snps\n",
" region=demo_region_snps,\n",
")\n",
"\n",
"print(f\"Dataset dimensions: {ds_snps.dims}\")\n",
Expand All @@ -1052,7 +1057,7 @@
"print(\"\\nFirst 5 variant positions:\")\n",
"print(ds_snps[\"variant_position\"].head(5).values)\n",
"print(\"\\nDataset Info:\")\n",
"ds_snps.info()\n"
"ds_snps.info()"
]
},
{
Expand Down Expand Up @@ -1126,7 +1131,7 @@
"ds_haps = ag3.phenotypes_with_haplotypes(\n",
" sample_sets=[demo_sample_set],\n",
" sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'dead'\",\n",
" region=demo_region_haps\n",
" region=demo_region_haps,\n",
")\n",
"\n",
"print(f\"Dataset dimensions: {ds_haps.dims}\")\n",
Expand All @@ -1140,7 +1145,7 @@
"print(\"\\nFirst 5 variant positions:\")\n",
"print(ds_haps[\"variant_position\"].head(5).values)\n",
"print(\"\\nDataset Info:\")\n",
"ds_haps.info()\n"
"ds_haps.info()"
]
},
{
Expand Down
Loading
Loading