diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py index 84d4f4820..2049929d1 100644 --- a/malariagen_data/anoph/distance.py +++ b/malariagen_data/anoph/distance.py @@ -527,6 +527,20 @@ def plot_njt( count_sort = True distance_sort = False + # Ensure we have enough samples for a tree. + # If we have 0 samples, `biallelic_snp_calls` or `snp_calls` should have already raised "No samples found". + # However, if we have 1 sample, it might pass through until here, where it would cause a failure in njt. + df_samples = self.sample_metadata( + sample_sets=sample_sets, + sample_query=sample_query, + sample_query_options=sample_query_options, + sample_indices=sample_indices, + ) + if 0 < len(df_samples) < 2: + raise ValueError( + f"Not enough samples for neighbour-joining tree. Found {len(df_samples)}, needed at least 2." + ) + # Compute neighbour-joining tree. Z, samples, n_snps_used = self.njt( region=region, diff --git a/malariagen_data/anoph/karyotype_params.py b/malariagen_data/anoph/karyotype_params.py index e13eaffc9..930597dec 100644 --- a/malariagen_data/anoph/karyotype_params.py +++ b/malariagen_data/anoph/karyotype_params.py @@ -1,6 +1,5 @@ """Parameter definitions for karyotype analysis functions.""" - from typing_extensions import Annotated, TypeAlias inversion_param: TypeAlias = Annotated[ diff --git a/malariagen_data/mjn.py b/malariagen_data/mjn.py index 8b3f5bc11..520a51777 100644 --- a/malariagen_data/mjn.py +++ b/malariagen_data/mjn.py @@ -264,7 +264,7 @@ def _mjn_graph_edges( # add further intermediate nodes as necessary for k in range(1, sep - 1): - source = f"anon_{i}_{j}_{k-1}" + source = f"anon_{i}_{j}_{k - 1}" target = f"anon_{i}_{j}_{k}" graph_node = { "id": target, @@ -280,10 +280,10 @@ def _mjn_graph_edges( graph_edges.append(graph_edge) # add edge from final intermediate node to node j - source = f"anon_{i}_{j}_{sep-2}" + source = f"anon_{i}_{j}_{sep - 2}" target = j graph_edge = { - "id": f"edge_{i}_{j}_{sep-1}", + "id": f"edge_{i}_{j}_{sep - 1}", "source": source, "target": target, } diff --git a/notebooks/auto_chunks.ipynb b/notebooks/auto_chunks.ipynb index eb637b87d..291f7318b 100644 --- a/notebooks/auto_chunks.ipynb +++ b/notebooks/auto_chunks.ipynb @@ -39,9 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = ag3.snp_calls(\n", - " region=\"3R\", sample_sets=\"AG1000G-BF-A\"\n", - ")" + "ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\")" ] }, { @@ -80,7 +78,9 @@ "outputs": [], "source": [ "ds = ag3.snp_calls(\n", - " region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"300MB\",\n", + " region=\"3R\",\n", + " sample_sets=\"AG1000G-BF-A\",\n", + " chunks=\"300MB\",\n", ")" ] }, @@ -119,9 +119,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = ag3.snp_calls(\n", - " region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\"\n", - ")" + "ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"auto\")" ] }, { @@ -159,9 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = ag3.snp_calls(\n", - " region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\"\n", - ")" + "ds = ag3.snp_calls(region=\"3R\", sample_sets=\"AG1000G-BF-A\", chunks=\"ndauto\")" ] }, { diff --git a/notebooks/extra_metadata.ipynb b/notebooks/extra_metadata.ipynb index 466d74ac6..c75b619f0 100644 --- a/notebooks/extra_metadata.ipynb +++ b/notebooks/extra_metadata.ipynb @@ -112,7 +112,9 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.haplotypes(region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\")" + "ag3.haplotypes(\n", + " region=\"3R\", analysis=\"gamb_colu_arab\", sample_query=sample_query, sample_sets=\"3.0\"\n", + ")" ] }, { diff --git a/notebooks/karyotype.ipynb b/notebooks/karyotype.ipynb index bd26b8a8d..857d560ca 100644 --- a/notebooks/karyotype.ipynb +++ b/notebooks/karyotype.ipynb @@ -120,7 +120,9 @@ }, "outputs": [], "source": [ - "ag3.plot_pca_coords(pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500)" + "ag3.plot_pca_coords(\n", + " pca_df_2la, color=\"karyotype_2La\", symbol=\"taxon\", width=600, height=500\n", + ")" ] }, { @@ -195,7 +197,9 @@ }, "outputs": [], "source": [ - "ag3.plot_pca_coords(pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500)" + "ag3.plot_pca_coords(\n", + " pca_df_2rb, color=\"karyotype_2Rb\", symbol=\"taxon\", width=600, height=500\n", + ")" ] }, { @@ -228,7 +232,7 @@ "outputs": [], "source": [ "kt_df_2rc_gam = ag3.karyotype(\n", - " inversion=\"2Rc_gam\", \n", + " inversion=\"2Rc_gam\",\n", " sample_sets=sample_sets,\n", " sample_query=\"taxon == 'gambiae'\",\n", ")\n", @@ -275,7 +279,9 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_pca_coords(pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500)" + "ag3.plot_pca_coords(\n", + " pca_df_2rc_gam, color=\"karyotype_2Rc_gam\", symbol=\"taxon\", width=600, height=500\n", + ")" ] }, { @@ -287,7 +293,9 @@ }, "outputs": [], "source": [ - "ag3.plot_pca_coords(pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500)" + "ag3.plot_pca_coords(\n", + " pca_df_2rc_gam, color=\"country\", symbol=\"taxon\", width=600, height=500\n", + ")" ] }, { @@ -308,7 +316,7 @@ "outputs": [], "source": [ "kt_df_2rc_col = ag3.karyotype(\n", - " inversion=\"2Rc_col\", \n", + " inversion=\"2Rc_col\",\n", " sample_sets=sample_sets,\n", " sample_query=\"taxon == 'coluzzii'\",\n", ")\n", @@ -357,7 +365,9 @@ }, "outputs": [], "source": [ - "ag3.plot_pca_coords(pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500)" + "ag3.plot_pca_coords(\n", + " pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500\n", + ")" ] } ], diff --git a/notebooks/local_cluster.ipynb b/notebooks/local_cluster.ipynb index 0d77b5159..91d56dc41 100644 --- a/notebooks/local_cluster.ipynb +++ b/notebooks/local_cluster.ipynb @@ -44,6 +44,7 @@ "outputs": [], "source": [ "from distributed import LocalCluster, Client\n", + "\n", "cluster = LocalCluster()\n", "cluster" ] diff --git a/notebooks/phenotype_data_demo.ipynb b/notebooks/phenotype_data_demo.ipynb index 4db6d0306..ac1e224f8 100644 --- a/notebooks/phenotype_data_demo.ipynb +++ b/notebooks/phenotype_data_demo.ipynb @@ -382,7 +382,7 @@ "ag3 = Ag3(pre=True)\n", "\n", "print(\"MalariaGEN Ag3 API client initialized.\")\n", - "print(ag3)\n" + "print(ag3)" ] }, { @@ -416,12 +416,14 @@ "\n", "# We'll pick one sample set for demonstration, preferably one known to have data\n", "# For this example, we'll use '1237-VO-BJ-DJOGBENOU-VMF00050'\n", - "demo_sample_set = '1237-VO-BJ-DJOGBENOU-VMF00050'\n", + "demo_sample_set = \"1237-VO-BJ-DJOGBENOU-VMF00050\"\n", "if demo_sample_set not in phenotype_sample_sets:\n", - " print(f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\")\n", + " print(\n", + " f\"Warning: '{demo_sample_set}' not found. Using the first available: {phenotype_sample_sets}\"\n", + " )\n", " demo_sample_set = phenotype_sample_sets\n", "\n", - "print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")\n" + "print(f\"\\nUsing sample set for demonstration: {demo_sample_set}\")" ] }, { @@ -531,18 +533,21 @@ } ], "source": [ - "print(f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\")\n", + "print(\n", + " f\"\\n--- Loading phenotype data for '{demo_sample_set}' filtered by Deltamethrin ---\"\n", + ")\n", "df_deltamethrin = ag3.phenotype_data(\n", - " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin'\"\n", + " sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n", ")\n", "\n", "print(f\"Shape of DataFrame: {df_deltamethrin.shape}\")\n", "print(\"\\nFirst 5 rows of the filtered DataFrame:\")\n", "df_deltamethrin.head()\n", - "print(f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\")\n", + "print(\n", + " f\"\\nUnique insecticides in filtered data: {df_deltamethrin['insecticide'].unique()}\"\n", + ")\n", "print(\"\\nDataFrame Info:\")\n", - "df_deltamethrin.info() " + "df_deltamethrin.info()" ] }, { @@ -647,7 +652,7 @@ "print(f\"\\n--- Loading phenotype data filtered by Deltamethrin and dose >= 1.0 ---\")\n", "df_filtered_multi = ag3.phenotype_data(\n", " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\"\n", + " sample_query=\"insecticide == 'Deltamethrin' and dose >= 1.0\",\n", ")\n", "\n", "print(f\"Shape of DataFrame: {df_filtered_multi.shape}\")\n", @@ -657,7 +662,7 @@ "print(f\"\\nUnique insecticides: {df_filtered_multi['insecticide'].unique()}\")\n", "print(f\"Unique doses: {df_filtered_multi['dose'].unique()}\")\n", "print(\"\\nDataFrame Info:\")\n", - "df_filtered_multi.info()\n" + "df_filtered_multi.info()" ] }, { @@ -761,14 +766,14 @@ "df_cohort_filtered = ag3.phenotype_data(\n", " sample_sets=[demo_sample_set],\n", " sample_query=\"insecticide == 'Deltamethrin'\",\n", - " min_cohort_size=10\n", + " min_cohort_size=10,\n", ")\n", "\n", "print(f\"Shape of DataFrame: {df_cohort_filtered.shape}\")\n", "print(\"\\nFirst 5 rows of the cohort-filtered DataFrame:\")\n", - "df_cohort_filtered.head() \n", + "df_cohort_filtered.head()\n", "print(\"\\nDataFrame Info:\")\n", - "df_cohort_filtered.info() \n", + "df_cohort_filtered.info()\n", "# Verify cohort sizes (optional, for internal testing)\n", "# if not df_cohort_filtered.empty:\n", "# cohort_keys = [\"insecticide\", \"dose\", \"location\", \"country\", \"sample_set\"]\n", @@ -777,7 +782,7 @@ "# cohort_sizes = df_cohort_filtered.groupby(available_keys).size()\n", "# print(\"\\nCohort sizes after filtering:\")\n", "# print(cohort_sizes)\n", - "# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")\n" + "# print(f\"All cohorts meet min_cohort_size (>=10): {all(cohort_sizes >= 10)}\")" ] }, { @@ -832,8 +837,7 @@ "\n", "# Example 1: Binary outcomes for all Deltamethrin samples\n", "binary_deltamethrin = ag3.phenotype_binary(\n", - " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin'\"\n", + " sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n", ")\n", "\n", "print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n", @@ -845,18 +849,19 @@ "# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n", "binary_alive_deltamethrin = ag3.phenotype_binary(\n", " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n", + " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n", ")\n", "\n", - "print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n", + "print(\n", + " f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n", + ")\n", "print(\"First 5 entries:\")\n", "print(binary_alive_deltamethrin.head())\n", "print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n", "\n", "# Example 3: Binary outcomes for samples with dose 0.5\n", "binary_dose_0_5 = ag3.phenotype_binary(\n", - " sample_sets=[demo_sample_set],\n", - " sample_query=\"dose == 0.5\"\n", + " sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n", ")\n", "\n", "print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n", @@ -927,8 +932,7 @@ "\n", "# Example 1: Binary outcomes for all Deltamethrin samples\n", "binary_deltamethrin = ag3.phenotype_binary(\n", - " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin'\"\n", + " sample_sets=[demo_sample_set], sample_query=\"insecticide == 'Deltamethrin'\"\n", ")\n", "\n", "print(f\"Shape of binary series: {binary_deltamethrin.shape}\")\n", @@ -940,18 +944,19 @@ "# Example 2: Binary outcomes for samples that were 'alive' with Deltamethrin\n", "binary_alive_deltamethrin = ag3.phenotype_binary(\n", " sample_sets=[demo_sample_set],\n", - " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\"\n", + " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n", ")\n", "\n", - "print(f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\")\n", + "print(\n", + " f\"\\nShape of binary series (alive Deltamethrin): {binary_alive_deltamethrin.shape}\"\n", + ")\n", "print(\"First 5 entries:\")\n", "print(binary_alive_deltamethrin.head())\n", "print(f\"Unique values in series: {binary_alive_deltamethrin.unique()}\")\n", "\n", "# Example 3: Binary outcomes for samples with dose 0.5\n", "binary_dose_0_5 = ag3.phenotype_binary(\n", - " sample_sets=[demo_sample_set],\n", - " sample_query=\"dose == 0.5\"\n", + " sample_sets=[demo_sample_set], sample_query=\"dose == 0.5\"\n", ")\n", "\n", "print(f\"\\nShape of binary series (dose 0.5): {binary_dose_0_5.shape}\")\n", @@ -1038,7 +1043,7 @@ "ds_snps = ag3.phenotypes_with_snps(\n", " sample_sets=[demo_sample_set],\n", " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'alive'\",\n", - " region=demo_region_snps\n", + " region=demo_region_snps,\n", ")\n", "\n", "print(f\"Dataset dimensions: {ds_snps.dims}\")\n", @@ -1052,7 +1057,7 @@ "print(\"\\nFirst 5 variant positions:\")\n", "print(ds_snps[\"variant_position\"].head(5).values)\n", "print(\"\\nDataset Info:\")\n", - "ds_snps.info()\n" + "ds_snps.info()" ] }, { @@ -1126,7 +1131,7 @@ "ds_haps = ag3.phenotypes_with_haplotypes(\n", " sample_sets=[demo_sample_set],\n", " sample_query=\"insecticide == 'Deltamethrin' and phenotype == 'dead'\",\n", - " region=demo_region_haps\n", + " region=demo_region_haps,\n", ")\n", "\n", "print(f\"Dataset dimensions: {ds_haps.dims}\")\n", @@ -1140,7 +1145,7 @@ "print(\"\\nFirst 5 variant positions:\")\n", "print(ds_haps[\"variant_position\"].head(5).values)\n", "print(\"\\nDataset Info:\")\n", - "ds_haps.info()\n" + "ds_haps.info()" ] }, { diff --git a/notebooks/plink_convert.ipynb b/notebooks/plink_convert.ipynb index deb9b984b..a82710ef2 100644 --- a/notebooks/plink_convert.ipynb +++ b/notebooks/plink_convert.ipynb @@ -7,7 +7,7 @@ "outputs": [], "source": [ "import malariagen_data\n", - "import os \n", + "import os\n", "\n", "ag3 = malariagen_data.Ag3(pre=True)" ] @@ -18,11 +18,12 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.biallelic_snps_to_plink(output_dir=os.getcwd(),\n", - " region='2L:100000-2000000',\n", - " n_snps=2000,\n", - " sample_sets='AG1000G-AO',\n", - " )" + "ag3.biallelic_snps_to_plink(\n", + " output_dir=os.getcwd(),\n", + " region=\"2L:100000-2000000\",\n", + " n_snps=2000,\n", + " sample_sets=\"AG1000G-AO\",\n", + ")" ] } ], diff --git a/notebooks/plot_diplotype_clustering.ipynb b/notebooks/plot_diplotype_clustering.ipynb index 83d9c5053..83a7df3ea 100644 --- a/notebooks/plot_diplotype_clustering.ipynb +++ b/notebooks/plot_diplotype_clustering.ipynb @@ -43,7 +43,7 @@ "fig = ag3.plot_diplotype_clustering_advanced(\n", " region=\"2R:28,480,000-28,500,000\",\n", " cnv_region=\"2R:28,480,000-28,500,000\",\n", - " snp_transcript='AGAP002862-RA',\n", + " snp_transcript=\"AGAP002862-RA\",\n", " snp_filter_min_maf=0.05,\n", " sample_sets=\"AG1000G-GH\",\n", " site_mask=\"gamb_colu\",\n", @@ -65,8 +65,8 @@ "outputs": [], "source": [ "ag3.plot_diplotype_clustering_advanced(\n", - " region='2R:28,480,000-28,490,000',\n", - " snp_transcript=['AGAP002862-RA', 'AGAP002864-RA'],\n", + " region=\"2R:28,480,000-28,490,000\",\n", + " snp_transcript=[\"AGAP002862-RA\", \"AGAP002864-RA\"],\n", " snp_query=\"effect == 'NON_SYNONYMOUS_CODING'\",\n", " snp_filter_min_maf=0.1,\n", " sample_sets=\"AG1000G-GH\",\n", @@ -88,9 +88,9 @@ "source": [ "ag3.plot_diplotype_clustering_advanced(\n", " region=\"2R:28,480,000-28,500,000\",\n", - " cnv_region = \"2R:28,480,000-28,500,000\",\n", + " cnv_region=\"2R:28,480,000-28,500,000\",\n", " snp_transcript=None,\n", - " sample_sets=[\"AG1000G-GH\", 'AG1000G-BF-A'],\n", + " sample_sets=[\"AG1000G-GH\", \"AG1000G-BF-A\"],\n", " snp_filter_min_maf=0.05,\n", " site_mask=\"gamb_colu\",\n", " color=\"taxon\",\n", @@ -411,11 +411,15 @@ "outputs": [], "source": [ "af1.plot_diplotype_clustering_advanced(\n", - " region = \"X:8,438,477-8,460,887\",\n", + " region=\"X:8,438,477-8,460,887\",\n", " snp_transcript=[\"LOC125764232_t1\"],\n", " cnv_region=\"X:8,418,477-8,480,887\",\n", - " sample_sets=[\"1232-VO-KE-OCHOMO-VMF00044\", \"1231-VO-MULTI-WONDJI-VMF00043\", \"1236-VO-TZ-OKUMU-VMF00090\"],\n", - " sample_query=\"country in ['Kenya', 'Uganda', 'Tanzania'] and taxon == 'funestus'\"\n", + " sample_sets=[\n", + " \"1232-VO-KE-OCHOMO-VMF00044\",\n", + " \"1231-VO-MULTI-WONDJI-VMF00043\",\n", + " \"1236-VO-TZ-OKUMU-VMF00090\",\n", + " ],\n", + " sample_query=\"country in ['Kenya', 'Uganda', 'Tanzania'] and taxon == 'funestus'\",\n", ")" ] }, diff --git a/notebooks/plot_frequencies_heatmap.ipynb b/notebooks/plot_frequencies_heatmap.ipynb index 56414912a..daab3576e 100644 --- a/notebooks/plot_frequencies_heatmap.ipynb +++ b/notebooks/plot_frequencies_heatmap.ipynb @@ -388,7 +388,7 @@ " \"AGAP002865\", # Cyp6p3\n", " \"AGAP000818\", # Cyp9k1\n", " \"AGAP008212\", # Cyp6m2\n", - " \"AGAP008218\", # Cyp6z2 \n", + " \"AGAP008218\", # Cyp6z2\n", "]\n", "\n", "cyp_cnv_freqs_df = ag3.gene_cnv_frequencies(\n", diff --git a/notebooks/plot_frequencies_space_time.ipynb b/notebooks/plot_frequencies_space_time.ipynb index 273ba679b..231d86aaa 100644 --- a/notebooks/plot_frequencies_space_time.ipynb +++ b/notebooks/plot_frequencies_space_time.ipynb @@ -92,7 +92,9 @@ "metadata": {}, "outputs": [], "source": [ - "ag3.plot_frequencies_time_series(ds, taxa=(\"gambiae\", \"arabiensis\"), height=500, width=1000)" + "ag3.plot_frequencies_time_series(\n", + " ds, taxa=(\"gambiae\", \"arabiensis\"), height=500, width=1000\n", + ")" ] }, { @@ -506,7 +508,7 @@ "metadata": {}, "outputs": [], "source": [ - "type(extra_metadata_df['random_year_as_period'][0])" + "type(extra_metadata_df[\"random_year_as_period\"][0])" ] }, { @@ -546,7 +548,7 @@ "metadata": {}, "outputs": [], "source": [ - "extra_sample_metadata_df['random_year_as_period'][:3]" + "extra_sample_metadata_df[\"random_year_as_period\"][:3]" ] }, { diff --git a/notebooks/plot_g123_gwss.ipynb b/notebooks/plot_g123_gwss.ipynb index 73ac584af..f2f31a42f 100644 --- a/notebooks/plot_g123_gwss.ipynb +++ b/notebooks/plot_g123_gwss.ipynb @@ -56,7 +56,7 @@ "contig = \"3L\"\n", "sample_set = \"AG1000G-BF-A\"\n", "sample_query = 'taxon == \"gambiae\"'\n", - "site_mask = \"gamb_colu\"\n" + "site_mask = \"gamb_colu\"" ] }, { @@ -208,7 +208,9 @@ "metadata": {}, "outputs": [], "source": [ - "af1.sample_metadata(sample_sets=\"1.0\").groupby([\"sample_set\", \"cohort_admin1_year\"]).size()" + "af1.sample_metadata(sample_sets=\"1.0\").groupby(\n", + " [\"sample_set\", \"cohort_admin1_year\"]\n", + ").size()" ] }, { diff --git a/notebooks/plot_genes.ipynb b/notebooks/plot_genes.ipynb index ee5ac1a17..8e7e9ff02 100644 --- a/notebooks/plot_genes.ipynb +++ b/notebooks/plot_genes.ipynb @@ -243,15 +243,15 @@ "outputs": [], "source": [ "ag3.plot_genes(\n", - " region=\"2R\",\n", - " gene_labels={\n", - " \"AGAP001096\": \"far left + gene\",\n", - " \"AGAP001099\": \"far left - gene\",\n", - " \"AGAP002942\": \"central + gene\",\n", - " \"AGAP002949\": \"central - gene\",\n", - " \"AGAP004676\": \"far right + gene\",\n", - " \"AGAP004674\": \"far right - gene\",\n", - " }\n", + " region=\"2R\",\n", + " gene_labels={\n", + " \"AGAP001096\": \"far left + gene\",\n", + " \"AGAP001099\": \"far left - gene\",\n", + " \"AGAP002942\": \"central + gene\",\n", + " \"AGAP002949\": \"central - gene\",\n", + " \"AGAP004676\": \"far right + gene\",\n", + " \"AGAP004674\": \"far right - gene\",\n", + " },\n", ")" ] }, @@ -264,29 +264,30 @@ "source": [ "import bokeh\n", "import pandas as pd\n", + "\n", "data = pd.DataFrame.from_dict(\n", - " [\n", - " {'pos': 10_000_000, 'y': 2, 'label': 'Custom Label A'},\n", - " {'pos': 30_000_000, 'y': 0.9, 'label': 'Custom Label B'},\n", - " {'pos': 50_000_000, 'y': -0.1, 'label': 'Custom Label C'},\n", - " ]\n", + " [\n", + " {\"pos\": 10_000_000, \"y\": 2, \"label\": \"Custom Label A\"},\n", + " {\"pos\": 30_000_000, \"y\": 0.9, \"label\": \"Custom Label B\"},\n", + " {\"pos\": 50_000_000, \"y\": -0.1, \"label\": \"Custom Label C\"},\n", + " ]\n", ")\n", "data_as_cds = bokeh.models.ColumnDataSource(data)\n", "gene_labelset = bokeh.models.LabelSet(\n", " source=data_as_cds,\n", - " x='pos',\n", - " y='y',\n", - " text='label',\n", - " text_align='center',\n", - " text_baseline='middle',\n", - " text_font_size='9pt',\n", - " text_color='blue',\n", + " x=\"pos\",\n", + " y=\"y\",\n", + " text=\"label\",\n", + " text_align=\"center\",\n", + " text_baseline=\"middle\",\n", + " text_font_size=\"9pt\",\n", + " text_color=\"blue\",\n", ")\n", "\n", "ag3.plot_genes(\n", - " region=\"2R\",\n", - " gene_labelset=gene_labelset,\n", - " height=200,\n", + " region=\"2R\",\n", + " gene_labelset=gene_labelset,\n", + " height=200,\n", ")" ] }, diff --git a/notebooks/plot_h12_h1x.ipynb b/notebooks/plot_h12_h1x.ipynb index 33e00f53a..2f751b5c8 100644 --- a/notebooks/plot_h12_h1x.ipynb +++ b/notebooks/plot_h12_h1x.ipynb @@ -206,7 +206,7 @@ " sample_sets=\"3.0\",\n", " analysis=\"gamb_colu\",\n", " cohort_size=20,\n", - " contig_colors=[\"red\", \"green\"]\n", + " contig_colors=[\"red\", \"green\"],\n", ")" ] }, diff --git a/notebooks/plot_haplotype_networks.ipynb b/notebooks/plot_haplotype_networks.ipynb index 38adaa9a8..0ecc0d121 100644 --- a/notebooks/plot_haplotype_networks.ipynb +++ b/notebooks/plot_haplotype_networks.ipynb @@ -121,10 +121,7 @@ "metadata": {}, "outputs": [], "source": [ - "color_mapping = {\n", - " \"Ghana\": \"country == 'Ghana'\",\n", - " \"Other\": \"country != 'Ghana'\"\n", - "}\n", + "color_mapping = {\"Ghana\": \"country == 'Ghana'\", \"Other\": \"country != 'Ghana'\"}\n", "ag3.plot_haplotype_network(\n", " region=\"2L:2,358,158-2,431,617\",\n", " analysis=\"gamb_colu\",\n", @@ -276,10 +273,7 @@ "metadata": {}, "outputs": [], "source": [ - "color_mapping = {\n", - " \"2012\": \"year == 2012\",\n", - " \"2014\": \"year == 2014\"\n", - "}\n", + "color_mapping = {\"2012\": \"year == 2012\", \"2014\": \"year == 2014\"}\n", "af1.plot_haplotype_network(\n", " region=\"2RL:2,358,158-2,431,617\",\n", " sample_query=\"country == 'Ghana'\",\n", diff --git a/notebooks/plot_haplotypes_frequencies.ipynb b/notebooks/plot_haplotypes_frequencies.ipynb index 0704d537c..9ad777363 100644 --- a/notebooks/plot_haplotypes_frequencies.ipynb +++ b/notebooks/plot_haplotypes_frequencies.ipynb @@ -32,7 +32,11 @@ "metadata": {}, "outputs": [], "source": [ - "hap_df = ag3.haplotypes_frequencies(\"2L:2,358,158-2,431,617\", \"admin1_year\", sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"))\n", + "hap_df = ag3.haplotypes_frequencies(\n", + " \"2L:2,358,158-2,431,617\",\n", + " \"admin1_year\",\n", + " sample_sets=(\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-BF-C\"),\n", + ")\n", "hap_df" ] }, @@ -53,7 +57,12 @@ "metadata": {}, "outputs": [], "source": [ - "hap_xr = ag3.haplotypes_frequencies_advanced(region=\"2L:2,358,158-2,431,617\", area_by=\"admin1_iso\", period_by=\"year\", sample_sets = [\"AG1000G-BF-A\", \"AG1000G-BF-B\"])" + "hap_xr = ag3.haplotypes_frequencies_advanced(\n", + " region=\"2L:2,358,158-2,431,617\",\n", + " area_by=\"admin1_iso\",\n", + " period_by=\"year\",\n", + " sample_sets=[\"AG1000G-BF-A\", \"AG1000G-BF-B\"],\n", + ")" ] }, { diff --git a/notebooks/plot_pairwise_average_fst.ipynb b/notebooks/plot_pairwise_average_fst.ipynb index cefddfe75..0b41014a6 100644 --- a/notebooks/plot_pairwise_average_fst.ipynb +++ b/notebooks/plot_pairwise_average_fst.ipynb @@ -47,9 +47,9 @@ "metadata": {}, "outputs": [], "source": [ - "region=\"3L:15,000,000-16,000,000\"\n", - "site_mask='gamb_colu'\n", - "n_jack=200" + "region = \"3L:15,000,000-16,000,000\"\n", + "site_mask = \"gamb_colu\"\n", + "n_jack = 200" ] }, { @@ -65,7 +65,8 @@ " cohort2_query=\"cohort_admin2_year == 'ML-2_Kati_gamb_2014'\",\n", " sample_sets=\"3.0\",\n", " n_jack=n_jack,\n", - " site_mask=site_mask)\n", + " site_mask=site_mask,\n", + ")\n", "fst_hudson, se_hudson" ] }, @@ -134,9 +135,9 @@ "metadata": {}, "outputs": [], "source": [ - "region=\"3L:15,000,000-16,000,000\"\n", - "site_mask='arab'\n", - "n_jack=200" + "region = \"3L:15,000,000-16,000,000\"\n", + "site_mask = \"arab\"\n", + "n_jack = 200" ] }, { @@ -203,7 +204,7 @@ " \"Kilifi_2012\": \"taxon == 'arabiensis' and location == 'Kilifi' and year == 2012\",\n", "}\n", "fst_df = ag3.pairwise_average_fst(\n", - " region=\"3L:15,000,000-16,000,000\", \n", + " region=\"3L:15,000,000-16,000,000\",\n", " cohorts=wild_cohorts,\n", " min_cohort_size=10,\n", " site_mask=\"arab\",\n", diff --git a/notebooks/plot_pca.ipynb b/notebooks/plot_pca.ipynb index fcda6d53a..74421ccc3 100644 --- a/notebooks/plot_pca.ipynb +++ b/notebooks/plot_pca.ipynb @@ -314,7 +314,9 @@ "ag3.plot_pca_coords_3d(\n", " df_pca,\n", " color=\"taxon\",\n", - " category_orders=dict(taxon=[\"coluzzii\", \"gambiae\", \"arabiensis\", \"gcx1\", \"gcx2\", \"gcx3\"]),\n", + " category_orders=dict(\n", + " taxon=[\"coluzzii\", \"gambiae\", \"arabiensis\", \"gcx1\", \"gcx2\", \"gcx3\"]\n", + " ),\n", " marker_size=2,\n", ")" ] diff --git a/notebooks/plot_samples.ipynb b/notebooks/plot_samples.ipynb index b4b9f2834..2f7f31c8a 100644 --- a/notebooks/plot_samples.ipynb +++ b/notebooks/plot_samples.ipynb @@ -116,10 +116,7 @@ "outputs": [], "source": [ "ag3.plot_sample_location_mapbox(\n", - " color='country',\n", - " sample_sets=[\"3.0\"],\n", - " sample_query=\"year > 2012\",\n", - " zoom=2\n", + " color=\"country\", sample_sets=[\"3.0\"], sample_query=\"year > 2012\", zoom=2\n", ")" ] }, @@ -130,9 +127,9 @@ "outputs": [], "source": [ "ag3.plot_sample_location_geo(\n", - " color='country',\n", - " sample_sets=[\"3.0\"],\n", - " sample_query=\"year > 2012\",\n", + " color=\"country\",\n", + " sample_sets=[\"3.0\"],\n", + " sample_query=\"year > 2012\",\n", ")" ] }, @@ -196,10 +193,7 @@ "outputs": [], "source": [ "af1.plot_sample_location_mapbox(\n", - " color='country',\n", - " sample_sets=[\"1.0\"],\n", - " sample_query=\"year > 2015\",\n", - " zoom=2\n", + " color=\"country\", sample_sets=[\"1.0\"], sample_query=\"year > 2015\", zoom=2\n", ")" ] }, @@ -210,9 +204,9 @@ "outputs": [], "source": [ "af1.plot_sample_location_geo(\n", - " color='country',\n", - " sample_sets=[\"1.0\"],\n", - " sample_query=\"year > 2015\",\n", + " color=\"country\",\n", + " sample_sets=[\"1.0\"],\n", + " sample_query=\"year > 2015\",\n", ")" ] }, diff --git a/tests/anoph/test_distance_errors.py b/tests/anoph/test_distance_errors.py new file mode 100644 index 000000000..0d224d15d --- /dev/null +++ b/tests/anoph/test_distance_errors.py @@ -0,0 +1,66 @@ +import pytest +from malariagen_data import ag3 as _ag3 +from malariagen_data.anoph.distance import AnophelesDistanceAnalysis + + +@pytest.fixture +def ag3_sim_api(ag3_sim_fixture): + return AnophelesDistanceAnalysis( + url=ag3_sim_fixture.url, + public_url=ag3_sim_fixture.url, + config_path=_ag3.CONFIG_PATH, + major_version_number=_ag3.MAJOR_VERSION_NUMBER, + major_version_path=_ag3.MAJOR_VERSION_PATH, + pre=True, + aim_metadata_dtype={ + "aim_species_fraction_arab": "float64", + "aim_species_fraction_colu": "float64", + "aim_species_fraction_colu_no2l": "float64", + "aim_species_gambcolu_arabiensis": object, + "aim_species_gambiae_coluzzii": object, + "aim_species": object, + }, + gff_gene_type="gene", + gff_gene_name_attribute="Name", + gff_default_attributes=("ID", "Parent", "Name", "description"), + default_site_mask="gamb_colu_arab", + results_cache=ag3_sim_fixture.results_cache_path.as_posix(), + taxon_colors=_ag3.TAXON_COLORS, + virtual_contigs=_ag3.VIRTUAL_CONTIGS, + ) + + +def test_plot_njt_no_samples(ag3_sim_api): + # Test with a query matching no samples. + with pytest.raises(ValueError) as e: + ag3_sim_api.plot_njt( + region="2L", n_snps=10, sample_query="sex_call == 'Impossible_Value'" + ) + assert "No samples found for query" in str( + e.value + ) or "No relevant samples found" in str(e.value) + + +def test_plot_njt_not_enough_snps(ag3_sim_api): + # Request more SNPs than available in the region + with pytest.raises(ValueError) as e: + ag3_sim_api.plot_njt(region="2L", n_snps=10000000, sample_query=None) + assert "Not enough SNPs." in str(e.value) + assert "Requested 10000000" in str(e.value) + + +def test_plot_njt_one_sample(ag3_sim_api): + # Test with a query that returns only 1 sample. + # This should trigger the minimum sample check in plot_njt. + + # First, find a sample so we can query for just one + df_samples = ag3_sim_api.sample_metadata() + sample_id = df_samples.iloc[0]["sample_id"] + + with pytest.raises(ValueError) as e: + ag3_sim_api.plot_njt( + region="2L", n_snps=10, sample_query=f"sample_id == '{sample_id}'" + ) + assert "Not enough samples for neighbour-joining tree" in str(e.value) + assert "Found 1" in str(e.value) + assert "needed at least 2" in str(e.value) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index 4a603da39..13b0eec42 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -160,7 +160,7 @@ def test_pca_plotting(fixture, api: AnophelesPca): # Check sizes. assert len(pca_df) == ds.sizes["samples"] for i in range(n_components): - assert f"PC{i+1}" in pca_df.columns, ( + assert f"PC{i + 1}" in pca_df.columns, ( "n_components", n_components, "n_samples", @@ -252,7 +252,7 @@ def test_pca_exclude_samples(fixture, api: AnophelesPca): # Check sizes. assert len(pca_df) == n_samples for i in range(n_components): - assert f"PC{i+1}" in pca_df.columns, ( + assert f"PC{i + 1}" in pca_df.columns, ( "n_components", n_components, "n_samples", @@ -262,7 +262,7 @@ def test_pca_exclude_samples(fixture, api: AnophelesPca): "n_snps", n_snps, ) - assert f"PC{n_components+1}" not in pca_df.columns + assert f"PC{n_components + 1}" not in pca_df.columns assert "pca_fit" in pca_df.columns assert pca_df["pca_fit"].all() assert pca_evr.ndim == 1 @@ -315,7 +315,7 @@ def test_pca_fit_exclude_samples(fixture, api: AnophelesPca): # Check sizes. assert len(pca_df) == n_samples for i in range(n_components): - assert f"PC{i+1}" in pca_df.columns, ( + assert f"PC{i + 1}" in pca_df.columns, ( "n_components", n_components, "n_samples", @@ -325,7 +325,7 @@ def test_pca_fit_exclude_samples(fixture, api: AnophelesPca): "n_snps", n_snps, ) - assert f"PC{n_components+1}" not in pca_df.columns + assert f"PC{n_components + 1}" not in pca_df.columns assert "pca_fit" in pca_df.columns assert pca_evr.ndim == 1 assert pca_evr.shape[0] == n_components