Skip to content

Commit 686a556

Browse files
add conf tests for dirus, simulated data, fixture, etc
1 parent ef78031 commit 686a556

17 files changed

Lines changed: 909 additions & 0 deletions

File tree

tests/anoph/conftest.py

Lines changed: 325 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2364,6 +2364,326 @@ def init_cnv_discordant_read_calls(self):
23642364
)
23652365

23662366

2367+
class Adir1Simulator(AnophelesSimulator):
2368+
def __init__(self, fixture_dir):
2369+
super().__init__(
2370+
fixture_dir=fixture_dir,
2371+
bucket="vo_adir_release_master_us_central1",
2372+
releases=("1.0",),
2373+
has_aims=False,
2374+
has_cohorts_by_quarter=False,
2375+
has_sequence_qc=True,
2376+
)
2377+
2378+
def init_config(self):
2379+
self.config = {
2380+
"PUBLIC_RELEASES": ["1.0"],
2381+
"GENESET_GFF3_PATH": "reference/genome/AdirusWRAIR2/VectorBase-68_AdirusWRAIR2.gff.gz",
2382+
"GENOME_FASTA_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta",
2383+
"GENOME_FAI_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta.fai",
2384+
"GENOME_ZARR_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.zarr",
2385+
"GENOME_REF_ID": "AdirusWRAIR2",
2386+
"GENOME_REF_NAME": "Anopheles dirus",
2387+
"CONTIGS": [
2388+
"KB672490",
2389+
"KB672868",
2390+
"KB672979",
2391+
], # Just using the three largest.
2392+
"SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.SEQANNOTATION.zarr",
2393+
"DEFAULT_SITE_FILTERS_ANALYSIS": "sc_20250610",
2394+
"DEFAULT_COHORTS_ANALYSIS": "20250710",
2395+
"DEFAULT_DISCORDANT_READ_CALLS_ANALYSIS": "",
2396+
"SITE_MASK_IDS": ["dirus"],
2397+
"PHASING_ANALYSIS_IDS": ["dirus_noneyet"],
2398+
}
2399+
config_path = self.bucket_path / "v1.0-config.json"
2400+
with config_path.open(mode="w") as f:
2401+
json.dump(self.config, f, indent=4)
2402+
2403+
def init_public_release_manifest(self):
2404+
# Here we create a release manifest for an Adir1-style
2405+
# public release. Note this is not the exact same data
2406+
# as the real release.
2407+
release_path = self.bucket_path / "v1.0"
2408+
release_path.mkdir(parents=True, exist_ok=True)
2409+
manifest_path = release_path / "manifest.tsv"
2410+
manifest = pd.DataFrame(
2411+
{
2412+
"sample_set": [
2413+
"1277-VO-KH-WITKOWSKI-VMF00151",
2414+
"1276-AD-BD-ALAM-VMF00156",
2415+
],
2416+
"sample_count": [20, 10],
2417+
"study_id": [
2418+
"1277-VO-KH-WITKOWSKI",
2419+
"1276-AD-BD-ALAM",
2420+
],
2421+
"study_url": [
2422+
"https://www.malariagen.net/network/where-we-work/1277-VO-KH-WITKOWSKI",
2423+
"https://www.malariagen.net/network/where-we-work/1276-AD-BD-ALAM",
2424+
],
2425+
"terms_of_use_expiry_date": [
2426+
"2027-06-01",
2427+
"2027-06-01",
2428+
],
2429+
"terms_of_use_url": [
2430+
"https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use",
2431+
"https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use",
2432+
],
2433+
}
2434+
)
2435+
manifest.to_csv(manifest_path, index=False, sep="\t")
2436+
self.release_manifests["1.0"] = manifest
2437+
2438+
def init_genome_sequence(self):
2439+
# Here we simulate a reference genome in a simple way
2440+
# but with much smaller contigs. The data are stored
2441+
# using zarr as with the real data releases.
2442+
2443+
# Use real base composition.
2444+
base_composition = {
2445+
b"a": 0.0,
2446+
b"c": 0.0,
2447+
b"g": 0.0,
2448+
b"t": 0.0,
2449+
b"n": 0.0,
2450+
b"A": 0.29432128333333335,
2451+
b"C": 0.20542065,
2452+
b"G": 0.20575796666666665,
2453+
b"T": 0.2944834333333333,
2454+
b"N": 1.6666666666666667e-05,
2455+
}
2456+
path = self.bucket_path / self.config["GENOME_ZARR_PATH"]
2457+
self.genome = simulate_genome(
2458+
path=path,
2459+
contigs=self.contigs,
2460+
low=80_000,
2461+
high=120_000,
2462+
base_composition=base_composition,
2463+
)
2464+
self.contig_sizes = {
2465+
contig: self.genome[contig].shape[0] for contig in self.contigs
2466+
}
2467+
2468+
def init_genome_features(self):
2469+
path = self.bucket_path / self.config["GENESET_GFF3_PATH"]
2470+
path.parent.mkdir(parents=True, exist_ok=True)
2471+
simulator = Gff3Simulator(
2472+
contig_sizes=self.contig_sizes,
2473+
# Af1 has a different gene type
2474+
gene_type="protein_coding_gene",
2475+
# Af1 has different attributes
2476+
attrs=("Note", "description"),
2477+
)
2478+
self.genome_features = simulator.simulate_gff(path=path)
2479+
2480+
def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
2481+
# Here we take the approach of using some of the real metadata,
2482+
# but truncating it to the number of samples included in the
2483+
# simulated data resource.
2484+
2485+
# Look up the number of samples in this sample set within the
2486+
# simulated data resource.
2487+
n_samples_sim = (
2488+
self.release_manifests[release]
2489+
.set_index("sample_set")
2490+
.loc[sample_set]["sample_count"]
2491+
)
2492+
2493+
# Create general metadata by sampling from some real metadata files.
2494+
src_path = (
2495+
self.fixture_dir
2496+
/ "vo_adir_release_master_us_central1"
2497+
/ release_path
2498+
/ "metadata"
2499+
/ "general"
2500+
/ sample_set
2501+
/ "samples.meta.csv"
2502+
)
2503+
df_general = pd.read_csv(src_path)
2504+
df_general_ds = df_general.sample(n_samples_sim, replace=False)
2505+
samples_ds = df_general_ds["sample_id"].tolist()
2506+
dst_path = (
2507+
self.bucket_path
2508+
/ release_path
2509+
/ "metadata"
2510+
/ "general"
2511+
/ sample_set
2512+
/ "samples.meta.csv"
2513+
)
2514+
dst_path.parent.mkdir(parents=True, exist_ok=True)
2515+
df_general_ds.to_csv(dst_path, index=False)
2516+
2517+
if sequence_qc:
2518+
# Create sequence QC metadata by sample from real metadata files.
2519+
src_path = (
2520+
self.fixture_dir
2521+
/ "vo_adir_release_master_us_central1"
2522+
/ release_path
2523+
/ "metadata"
2524+
/ "curation"
2525+
/ sample_set
2526+
/ "sequence_qc_stats.csv"
2527+
)
2528+
df_sequence_qc_stats = pd.read_csv(src_path)
2529+
df_sequence_qc_stats_ds = (
2530+
df_sequence_qc_stats.set_index("sample_id")
2531+
.loc[samples_ds]
2532+
.reset_index()
2533+
)
2534+
dst_path = (
2535+
self.bucket_path
2536+
/ release_path
2537+
/ "metadata"
2538+
/ "curation"
2539+
/ sample_set
2540+
/ "sequence_qc_stats.csv"
2541+
)
2542+
dst_path.parent.mkdir(parents=True, exist_ok=True)
2543+
df_sequence_qc_stats_ds.to_csv(dst_path, index=False)
2544+
2545+
# Create cohorts metadata by sampling from some real metadata files.
2546+
src_path = (
2547+
self.fixture_dir
2548+
/ "vo_adir_release_master_us_central1"
2549+
/ release_path
2550+
/ "metadata"
2551+
/ "cohorts_20250710"
2552+
/ sample_set
2553+
/ "samples.cohorts.csv"
2554+
)
2555+
df_coh = pd.read_csv(src_path)
2556+
df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
2557+
dst_path = (
2558+
self.bucket_path
2559+
/ release_path
2560+
/ "metadata"
2561+
/ "cohorts_20250710"
2562+
/ sample_set
2563+
/ "samples.cohorts.csv"
2564+
)
2565+
dst_path.parent.mkdir(parents=True, exist_ok=True)
2566+
df_coh_ds.to_csv(dst_path, index=False)
2567+
2568+
# Create data catalog by sampling from some real metadata files.
2569+
src_path = (
2570+
self.fixture_dir
2571+
/ "vo_adir_release_master_us_central1"
2572+
/ release_path
2573+
/ "metadata"
2574+
/ "general"
2575+
/ sample_set
2576+
/ "wgs_snp_data.csv"
2577+
)
2578+
df_cat = pd.read_csv(src_path)
2579+
df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
2580+
dst_path = (
2581+
self.bucket_path
2582+
/ release_path
2583+
/ "metadata"
2584+
/ "general"
2585+
/ sample_set
2586+
/ "wgs_snp_data.csv"
2587+
)
2588+
dst_path.parent.mkdir(parents=True, exist_ok=True)
2589+
df_cat_ds.to_csv(dst_path, index=False)
2590+
2591+
# # Create accessions catalog by sampling from some real metadata files.
2592+
# src_path = (
2593+
# self.fixture_dir
2594+
# / "vo_adir_release_master_us_central1"
2595+
# / release_path
2596+
# / "metadata"
2597+
# / "general"
2598+
# / sample_set
2599+
# / "wgs_accession_data.csv"
2600+
# )
2601+
# df_cat = pd.read_csv(src_path)
2602+
# df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
2603+
# dst_path = (
2604+
# self.bucket_path
2605+
# / release_path
2606+
# / "metadata"
2607+
# / "general"
2608+
# / sample_set
2609+
# / "wgs_accession_data.csv"
2610+
# )
2611+
# dst_path.parent.mkdir(parents=True, exist_ok=True)
2612+
# df_cat_ds.to_csv(dst_path, index=False)
2613+
2614+
def init_metadata(self):
2615+
self.write_metadata(
2616+
release="1.0",
2617+
release_path="v1.0",
2618+
sample_set="1277-VO-KH-WITKOWSKI-VMF00151",
2619+
)
2620+
self.write_metadata(
2621+
release="1.0",
2622+
release_path="v1.0",
2623+
sample_set="1276-AD-BD-ALAM-VMF00156",
2624+
)
2625+
2626+
def init_snp_sites(self):
2627+
path = self.bucket_path / "v1.0/snp_genotypes/all/sites/"
2628+
self.snp_sites, self.n_snp_sites = simulate_snp_sites(
2629+
path=path, contigs=self.contigs, genome=self.genome
2630+
)
2631+
2632+
def init_site_filters(self):
2633+
analysis = self.config["DEFAULT_SITE_FILTERS_ANALYSIS"]
2634+
2635+
# Simulate the funestus mask.
2636+
mask = "dirus"
2637+
p_pass = 0.59
2638+
path = self.bucket_path / "v1.0/site_filters" / analysis / mask
2639+
simulate_site_filters(
2640+
path=path, contigs=self.contigs, p_pass=p_pass, n_sites=self.n_snp_sites
2641+
)
2642+
2643+
def init_snp_genotypes(self):
2644+
# Iterate over releases.
2645+
for release, manifest in self.release_manifests.items():
2646+
# Determine release path.
2647+
release_path = f"v{release}"
2648+
2649+
# Iterate over sample sets in the release.
2650+
for rec in manifest.itertuples():
2651+
sample_set = rec.sample_set
2652+
metadata_path = (
2653+
self.bucket_path
2654+
/ release_path
2655+
/ "metadata"
2656+
/ "general"
2657+
/ sample_set
2658+
/ "samples.meta.csv"
2659+
)
2660+
2661+
# Create zarr hierarchy.
2662+
zarr_path = (
2663+
self.bucket_path
2664+
/ release_path
2665+
/ "snp_genotypes"
2666+
/ "all"
2667+
/ sample_set
2668+
)
2669+
2670+
# Simulate SNP genotype data.
2671+
p_allele = np.array([0.981, 0.006, 0.008, 0.005])
2672+
p_missing = np.array([0.95, 0.05])
2673+
simulate_snp_genotypes(
2674+
zarr_path=zarr_path,
2675+
metadata_path=metadata_path,
2676+
contigs=self.contigs,
2677+
n_sites=self.n_snp_sites,
2678+
p_allele=p_allele,
2679+
p_missing=p_missing,
2680+
)
2681+
2682+
def init_site_annotations(self):
2683+
path = self.bucket_path / self.config["SITE_ANNOTATIONS_ZARR_PATH"]
2684+
simulate_site_annotations(path=path, genome=self.genome)
2685+
2686+
23672687
# For the following data fixtures we will use the "session" scope
23682688
# so that the fixture data will be created only once per test
23692689
# session (i.e., per invocation of pytest).
@@ -2384,3 +2704,8 @@ def ag3_sim_fixture(fixture_dir):
23842704
@pytest.fixture(scope="session")
23852705
def af1_sim_fixture(fixture_dir):
23862706
return Af1Simulator(fixture_dir=fixture_dir)
2707+
2708+
2709+
@pytest.fixture(scope="session")
2710+
def adir1_sim_fixture(fixture_dir):
2711+
return Adir1Simulator(fixture_dir=fixture_dir)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
sample_id,country,country_ISO,adm1_name,adm1_ISO,adm2_name
2+
VBS46299-6321STDY9453299,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
3+
VBS46307-6321STDY9453307,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
4+
VBS46315-6321STDY9453315,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
5+
VBS46323-6321STDY9453323,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
6+
VBS46331-6321STDY9453331,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
7+
VBS46339-6321STDY9453339,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
8+
VBS46300-6321STDY9453300,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
9+
VBS46308-6321STDY9453308,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
10+
VBS46316-6321STDY9453316,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
11+
VBS46324-6321STDY9453324,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
12+
VBS46332-6321STDY9453332,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
13+
VBS46340-6321STDY9453340,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
14+
VBS46301-6321STDY9453301,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
15+
VBS46309-6321STDY9453309,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
16+
VBS46317-6321STDY9453317,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
17+
VBS46325-6321STDY9453325,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
18+
VBS46333-6321STDY9453333,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
19+
VBS46341-6321STDY9453341,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
20+
VBS46302-6321STDY9453302,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
21+
VBS46310-6321STDY9453310,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
22+
VBS46318-6321STDY9453318,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
23+
VBS46326-6321STDY9453326,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
24+
VBS46334-6321STDY9453334,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
25+
VBS46342-6321STDY9453342,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
26+
VBS46303-6321STDY9453303,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
27+
VBS46311-6321STDY9453311,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
28+
VBS46319-6321STDY9453319,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
29+
VBS46327-6321STDY9453327,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
30+
VBS46335-6321STDY9453335,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
31+
VBS46343-6321STDY9453343,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
32+
VBS46304-6321STDY9453304,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
33+
VBS46312-6321STDY9453312,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
34+
VBS46320-6321STDY9453320,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
35+
VBS46328-6321STDY9453328,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
36+
VBS46336-6321STDY9453336,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
37+
VBS46344-6321STDY9453344,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
38+
VBS46305-6321STDY9453305,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
39+
VBS46313-6321STDY9453313,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
40+
VBS46321-6321STDY9453321,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
41+
VBS46329-6321STDY9453329,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
42+
VBS46337-6321STDY9453337,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
43+
VBS46345-6321STDY9453345,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
44+
VBS46306-6321STDY9453306,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
45+
VBS46314-6321STDY9453314,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
46+
VBS46322-6321STDY9453322,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
47+
VBS46330-6321STDY9453330,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
48+
VBS46338-6321STDY9453338,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati

0 commit comments

Comments
 (0)