|
| 1 | +import random |
| 2 | + |
| 3 | +import pytest |
| 4 | +from pytest_cases import parametrize_with_cases |
| 5 | + |
| 6 | +from malariagen_data import af1 as _af1 |
| 7 | +from malariagen_data import ag3 as _ag3 |
| 8 | + |
| 9 | +from malariagen_data.anoph.ld import AnophelesLdAnalysis |
| 10 | + |
| 11 | + |
@pytest.fixture
def ag3_sim_api(ag3_sim_fixture):
    """Return an ``AnophelesLdAnalysis`` configured from the simulated Ag3 fixture.

    The simulated fixture's URL is used for both ``url`` and ``public_url``,
    and its results cache path is passed on as a POSIX-style string.
    """
    sim_url = ag3_sim_fixture.url

    # Explicit dtypes for the AIM metadata columns: the three fraction
    # columns are floats, the species-call columns are generic objects.
    aim_dtypes = {
        "aim_species_fraction_arab": "float64",
        "aim_species_fraction_colu": "float64",
        "aim_species_fraction_colu_no2l": "float64",
        "aim_species_gambcolu_arabiensis": object,
        "aim_species_gambiae_coluzzii": object,
        "aim_species": object,
    }

    api_kwargs = dict(
        url=sim_url,
        public_url=sim_url,
        config_path=_ag3.CONFIG_PATH,
        major_version_number=_ag3.MAJOR_VERSION_NUMBER,
        major_version_path=_ag3.MAJOR_VERSION_PATH,
        pre=True,
        aim_metadata_dtype=aim_dtypes,
        gff_gene_type="gene",
        gff_gene_name_attribute="Name",
        gff_default_attributes=("ID", "Parent", "Name", "description"),
        default_site_mask="gamb_colu_arab",
        results_cache=ag3_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_ag3.TAXON_COLORS,
        virtual_contigs=_ag3.VIRTUAL_CONTIGS,
    )
    return AnophelesLdAnalysis(**api_kwargs)
| 37 | + |
| 38 | + |
@pytest.fixture
def af1_sim_api(af1_sim_fixture):
    """Return an ``AnophelesLdAnalysis`` configured from the simulated Af1 fixture.

    The simulated fixture's URL is used for both ``url`` and ``public_url``,
    and its results cache path is passed on as a POSIX-style string.
    """
    sim_url = af1_sim_fixture.url

    api_kwargs = dict(
        url=sim_url,
        public_url=sim_url,
        config_path=_af1.CONFIG_PATH,
        major_version_number=_af1.MAJOR_VERSION_NUMBER,
        major_version_path=_af1.MAJOR_VERSION_PATH,
        pre=False,
        gff_gene_type="protein_coding_gene",
        gff_gene_name_attribute="Note",
        gff_default_attributes=("ID", "Parent", "Note", "description"),
        default_site_mask="funestus",
        results_cache=af1_sim_fixture.results_cache_path.as_posix(),
        taxon_colors=_af1.TAXON_COLORS,
    )
    return AnophelesLdAnalysis(**api_kwargs)
| 55 | + |
| 56 | + |
def case_ag3_sim(ag3_sim_fixture, ag3_sim_api):
    """Test case: pair the simulated Ag3 fixture with its LD analysis API."""
    case = (ag3_sim_fixture, ag3_sim_api)
    return case
| 59 | + |
| 60 | + |
def case_af1_sim(af1_sim_fixture, af1_sim_api):
    """Test case: pair the simulated Af1 fixture with its LD analysis API."""
    case = (af1_sim_fixture, af1_sim_api)
    return case
| 63 | + |
| 64 | + |
@parametrize_with_cases("fixture,api", cases=".")
def test_ld_pruning_returns_fewer_snps(fixture, api: AnophelesLdAnalysis):
    """Check the LD-pruned call set is non-empty and no larger than ``n_snps``.

    A contig and a site mask are picked at random. The test is skipped when
    the unpruned biallelic call set for that choice has fewer than 10 variants.
    """
    # Region is drawn before the site mask; both feed the two API calls below.
    chosen_region = random.choice(api.contigs)
    chosen_mask = random.choice(api.site_mask_ids)
    snp_query = dict(
        region=chosen_region,
        site_mask=chosen_mask,
        min_minor_ac=1,
        max_missing_an=0,
    )

    n_available = api.biallelic_snp_calls(**snp_query).sizes["variants"]
    if n_available < 10:
        pytest.skip("Not enough variants for LD pruning test")

    # Request at most 200 SNPs, and never more than are available.
    n_requested = min(n_available, 200)
    pruned = api.biallelic_snp_calls_ld_pruned(n_snps=n_requested, **snp_query)

    # Pruning may keep everything or drop some sites, but never all of them.
    n_pruned = pruned.sizes["variants"]
    assert n_pruned <= n_requested
    assert n_pruned > 0
| 92 | + |
| 93 | + |
@parametrize_with_cases("fixture,api", cases=".")
def test_ld_pruned_dataset_structure(fixture, api: AnophelesLdAnalysis):
    """Check the coordinates, data variables and dimensions of the pruned dataset.

    A contig and a site mask are picked at random. The test is skipped when
    the unpruned biallelic call set for that choice has fewer than 10 variants.
    """
    # Region is drawn before the site mask; both feed the two API calls below.
    chosen_region = random.choice(api.contigs)
    chosen_mask = random.choice(api.site_mask_ids)
    snp_query = dict(
        region=chosen_region,
        site_mask=chosen_mask,
        min_minor_ac=1,
        max_missing_an=0,
    )

    n_available = api.biallelic_snp_calls(**snp_query).sizes["variants"]
    if n_available < 10:
        pytest.skip("Not enough variants for LD pruning test")

    # Request at most 200 SNPs, and never more than are available.
    n_requested = min(n_available, 200)
    pruned = api.biallelic_snp_calls_ld_pruned(n_snps=n_requested, **snp_query)

    # Coordinates that must be present.
    for coord_name in ("sample_id", "variant_position", "variant_contig"):
        assert coord_name in pruned.coords

    # Data variables that must be present.
    for var_name in ("variant_allele", "call_genotype"):
        assert var_name in pruned.data_vars

    # Dimensions that must be present.
    for dim_name in ("variants", "samples", "ploidy", "alleles"):
        assert dim_name in pruned.dims

    # Biallelic data: exactly two alleles per variant.
    assert pruned.sizes["alleles"] == 2
| 135 | + |
| 136 | + |
@parametrize_with_cases("fixture,api", cases=".")
def test_ld_pruned_plink_compatibility(fixture, api: AnophelesLdAnalysis):
    """Check the pruned dataset carries the variables PlinkConverter needs.

    Also checks that the genotype and allele arrays have shapes consistent
    with the dataset's ``variants`` and ``samples`` sizes. A contig and a
    site mask are picked at random. The test is skipped when the unpruned
    biallelic call set for that choice has fewer than 10 variants.
    """
    # Region is drawn before the site mask; both feed the two API calls below.
    chosen_region = random.choice(api.contigs)
    chosen_mask = random.choice(api.site_mask_ids)
    snp_query = dict(
        region=chosen_region,
        site_mask=chosen_mask,
        min_minor_ac=1,
        max_missing_an=0,
    )

    n_available = api.biallelic_snp_calls(**snp_query).sizes["variants"]
    if n_available < 10:
        pytest.skip("Not enough variants for LD pruning test")

    # Request at most 200 SNPs, and never more than are available.
    n_requested = min(n_available, 200)
    pruned = api.biallelic_snp_calls_ld_pruned(n_snps=n_requested, **snp_query)

    # Everything PlinkConverter reads must be present in the pruned dataset.
    for var_name in ("call_genotype", "variant_allele"):
        assert var_name in pruned
    for coord_name in ("variant_contig", "variant_position", "sample_id"):
        assert coord_name in pruned.coords

    # Array shapes must agree with the dataset's own dimension sizes.
    n_variants = pruned.sizes["variants"]
    n_samples = pruned.sizes["samples"]
    expected_genotype_shape = (n_variants, n_samples, 2)
    expected_allele_shape = (n_variants, 2)
    assert pruned["call_genotype"].shape == expected_genotype_shape
    assert pruned["variant_allele"].shape == expected_allele_shape
0 commit comments