@@ -2364,6 +2364,326 @@ def init_cnv_discordant_read_calls(self):
23642364 )
23652365
23662366
2367+ class Adir1Simulator (AnophelesSimulator ):
2368+ def __init__ (self , fixture_dir ):
2369+ super ().__init__ (
2370+ fixture_dir = fixture_dir ,
2371+ bucket = "vo_adir_release_master_us_central1" ,
2372+ releases = ("1.0" ,),
2373+ has_aims = False ,
2374+ has_cohorts_by_quarter = False ,
2375+ has_sequence_qc = True ,
2376+ )
2377+
2378+ def init_config (self ):
2379+ self .config = {
2380+ "PUBLIC_RELEASES" : ["1.0" ],
2381+ "GENESET_GFF3_PATH" : "reference/genome/AdirusWRAIR2/VectorBase-68_AdirusWRAIR2.gff.gz" ,
2382+ "GENOME_FASTA_PATH" : "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta" ,
2383+ "GENOME_FAI_PATH" : "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta.fai" ,
2384+ "GENOME_ZARR_PATH" : "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.zarr" ,
2385+ "GENOME_REF_ID" : "AdirusWRAIR2" ,
2386+ "GENOME_REF_NAME" : "Anopheles dirus" ,
2387+ "CONTIGS" : [
2388+ "KB672490" ,
2389+ "KB672868" ,
2390+ "KB672979" ,
2391+ ], # Just using the three largest.
2392+ "SITE_ANNOTATIONS_ZARR_PATH" : "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.SEQANNOTATION.zarr" ,
2393+ "DEFAULT_SITE_FILTERS_ANALYSIS" : "sc_20250610" ,
2394+ "DEFAULT_COHORTS_ANALYSIS" : "20250710" ,
2395+ "DEFAULT_DISCORDANT_READ_CALLS_ANALYSIS" : "" ,
2396+ "SITE_MASK_IDS" : ["dirus" ],
2397+ "PHASING_ANALYSIS_IDS" : ["dirus_noneyet" ],
2398+ }
2399+ config_path = self .bucket_path / "v1.0-config.json"
2400+ with config_path .open (mode = "w" ) as f :
2401+ json .dump (self .config , f , indent = 4 )
2402+
2403+ def init_public_release_manifest (self ):
2404+ # Here we create a release manifest for an Adir1-style
2405+ # public release. Note this is not the exact same data
2406+ # as the real release.
2407+ release_path = self .bucket_path / "v1.0"
2408+ release_path .mkdir (parents = True , exist_ok = True )
2409+ manifest_path = release_path / "manifest.tsv"
2410+ manifest = pd .DataFrame (
2411+ {
2412+ "sample_set" : [
2413+ "1277-VO-KH-WITKOWSKI-VMF00151" ,
2414+ "1276-AD-BD-ALAM-VMF00156" ,
2415+ ],
2416+ "sample_count" : [20 , 10 ],
2417+ "study_id" : [
2418+ "1277-VO-KH-WITKOWSKI" ,
2419+ "1276-AD-BD-ALAM" ,
2420+ ],
2421+ "study_url" : [
2422+ "https://www.malariagen.net/network/where-we-work/1277-VO-KH-WITKOWSKI" ,
2423+ "https://www.malariagen.net/network/where-we-work/1276-AD-BD-ALAM" ,
2424+ ],
2425+ "terms_of_use_expiry_date" : [
2426+ "2027-06-01" ,
2427+ "2027-06-01" ,
2428+ ],
2429+ "terms_of_use_url" : [
2430+ "https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use" ,
2431+ "https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use" ,
2432+ ],
2433+ }
2434+ )
2435+ manifest .to_csv (manifest_path , index = False , sep = "\t " )
2436+ self .release_manifests ["1.0" ] = manifest
2437+
2438+ def init_genome_sequence (self ):
2439+ # Here we simulate a reference genome in a simple way
2440+ # but with much smaller contigs. The data are stored
2441+ # using zarr as with the real data releases.
2442+
2443+ # Use real base composition.
2444+ base_composition = {
2445+ b"a" : 0.0 ,
2446+ b"c" : 0.0 ,
2447+ b"g" : 0.0 ,
2448+ b"t" : 0.0 ,
2449+ b"n" : 0.0 ,
2450+ b"A" : 0.29432128333333335 ,
2451+ b"C" : 0.20542065 ,
2452+ b"G" : 0.20575796666666665 ,
2453+ b"T" : 0.2944834333333333 ,
2454+ b"N" : 1.6666666666666667e-05 ,
2455+ }
2456+ path = self .bucket_path / self .config ["GENOME_ZARR_PATH" ]
2457+ self .genome = simulate_genome (
2458+ path = path ,
2459+ contigs = self .contigs ,
2460+ low = 80_000 ,
2461+ high = 120_000 ,
2462+ base_composition = base_composition ,
2463+ )
2464+ self .contig_sizes = {
2465+ contig : self .genome [contig ].shape [0 ] for contig in self .contigs
2466+ }
2467+
2468+ def init_genome_features (self ):
2469+ path = self .bucket_path / self .config ["GENESET_GFF3_PATH" ]
2470+ path .parent .mkdir (parents = True , exist_ok = True )
2471+ simulator = Gff3Simulator (
2472+ contig_sizes = self .contig_sizes ,
2473+ # Af1 has a different gene type
2474+ gene_type = "protein_coding_gene" ,
2475+ # Af1 has different attributes
2476+ attrs = ("Note" , "description" ),
2477+ )
2478+ self .genome_features = simulator .simulate_gff (path = path )
2479+
2480+ def write_metadata (self , release , release_path , sample_set , sequence_qc = True ):
2481+ # Here we take the approach of using some of the real metadata,
2482+ # but truncating it to the number of samples included in the
2483+ # simulated data resource.
2484+
2485+ # Look up the number of samples in this sample set within the
2486+ # simulated data resource.
2487+ n_samples_sim = (
2488+ self .release_manifests [release ]
2489+ .set_index ("sample_set" )
2490+ .loc [sample_set ]["sample_count" ]
2491+ )
2492+
2493+ # Create general metadata by sampling from some real metadata files.
2494+ src_path = (
2495+ self .fixture_dir
2496+ / "vo_adir_release_master_us_central1"
2497+ / release_path
2498+ / "metadata"
2499+ / "general"
2500+ / sample_set
2501+ / "samples.meta.csv"
2502+ )
2503+ df_general = pd .read_csv (src_path )
2504+ df_general_ds = df_general .sample (n_samples_sim , replace = False )
2505+ samples_ds = df_general_ds ["sample_id" ].tolist ()
2506+ dst_path = (
2507+ self .bucket_path
2508+ / release_path
2509+ / "metadata"
2510+ / "general"
2511+ / sample_set
2512+ / "samples.meta.csv"
2513+ )
2514+ dst_path .parent .mkdir (parents = True , exist_ok = True )
2515+ df_general_ds .to_csv (dst_path , index = False )
2516+
2517+ if sequence_qc :
2518+ # Create sequence QC metadata by sample from real metadata files.
2519+ src_path = (
2520+ self .fixture_dir
2521+ / "vo_adir_release_master_us_central1"
2522+ / release_path
2523+ / "metadata"
2524+ / "curation"
2525+ / sample_set
2526+ / "sequence_qc_stats.csv"
2527+ )
2528+ df_sequence_qc_stats = pd .read_csv (src_path )
2529+ df_sequence_qc_stats_ds = (
2530+ df_sequence_qc_stats .set_index ("sample_id" )
2531+ .loc [samples_ds ]
2532+ .reset_index ()
2533+ )
2534+ dst_path = (
2535+ self .bucket_path
2536+ / release_path
2537+ / "metadata"
2538+ / "curation"
2539+ / sample_set
2540+ / "sequence_qc_stats.csv"
2541+ )
2542+ dst_path .parent .mkdir (parents = True , exist_ok = True )
2543+ df_sequence_qc_stats_ds .to_csv (dst_path , index = False )
2544+
2545+ # Create cohorts metadata by sampling from some real metadata files.
2546+ src_path = (
2547+ self .fixture_dir
2548+ / "vo_adir_release_master_us_central1"
2549+ / release_path
2550+ / "metadata"
2551+ / "cohorts_20250710"
2552+ / sample_set
2553+ / "samples.cohorts.csv"
2554+ )
2555+ df_coh = pd .read_csv (src_path )
2556+ df_coh_ds = df_coh .set_index ("sample_id" ).loc [samples_ds ].reset_index ()
2557+ dst_path = (
2558+ self .bucket_path
2559+ / release_path
2560+ / "metadata"
2561+ / "cohorts_20250710"
2562+ / sample_set
2563+ / "samples.cohorts.csv"
2564+ )
2565+ dst_path .parent .mkdir (parents = True , exist_ok = True )
2566+ df_coh_ds .to_csv (dst_path , index = False )
2567+
2568+ # Create data catalog by sampling from some real metadata files.
2569+ src_path = (
2570+ self .fixture_dir
2571+ / "vo_adir_release_master_us_central1"
2572+ / release_path
2573+ / "metadata"
2574+ / "general"
2575+ / sample_set
2576+ / "wgs_snp_data.csv"
2577+ )
2578+ df_cat = pd .read_csv (src_path )
2579+ df_cat_ds = df_cat .set_index ("sample_id" ).loc [samples_ds ].reset_index ()
2580+ dst_path = (
2581+ self .bucket_path
2582+ / release_path
2583+ / "metadata"
2584+ / "general"
2585+ / sample_set
2586+ / "wgs_snp_data.csv"
2587+ )
2588+ dst_path .parent .mkdir (parents = True , exist_ok = True )
2589+ df_cat_ds .to_csv (dst_path , index = False )
2590+
2591+ # # Create accessions catalog by sampling from some real metadata files.
2592+ # src_path = (
2593+ # self.fixture_dir
2594+ # / "vo_adir_release_master_us_central1"
2595+ # / release_path
2596+ # / "metadata"
2597+ # / "general"
2598+ # / sample_set
2599+ # / "wgs_accession_data.csv"
2600+ # )
2601+ # df_cat = pd.read_csv(src_path)
2602+ # df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
2603+ # dst_path = (
2604+ # self.bucket_path
2605+ # / release_path
2606+ # / "metadata"
2607+ # / "general"
2608+ # / sample_set
2609+ # / "wgs_accession_data.csv"
2610+ # )
2611+ # dst_path.parent.mkdir(parents=True, exist_ok=True)
2612+ # df_cat_ds.to_csv(dst_path, index=False)
2613+
2614+ def init_metadata (self ):
2615+ self .write_metadata (
2616+ release = "1.0" ,
2617+ release_path = "v1.0" ,
2618+ sample_set = "1277-VO-KH-WITKOWSKI-VMF00151" ,
2619+ )
2620+ self .write_metadata (
2621+ release = "1.0" ,
2622+ release_path = "v1.0" ,
2623+ sample_set = "1276-AD-BD-ALAM-VMF00156" ,
2624+ )
2625+
2626+ def init_snp_sites (self ):
2627+ path = self .bucket_path / "v1.0/snp_genotypes/all/sites/"
2628+ self .snp_sites , self .n_snp_sites = simulate_snp_sites (
2629+ path = path , contigs = self .contigs , genome = self .genome
2630+ )
2631+
2632+ def init_site_filters (self ):
2633+ analysis = self .config ["DEFAULT_SITE_FILTERS_ANALYSIS" ]
2634+
2635+ # Simulate the funestus mask.
2636+ mask = "dirus"
2637+ p_pass = 0.59
2638+ path = self .bucket_path / "v1.0/site_filters" / analysis / mask
2639+ simulate_site_filters (
2640+ path = path , contigs = self .contigs , p_pass = p_pass , n_sites = self .n_snp_sites
2641+ )
2642+
2643+ def init_snp_genotypes (self ):
2644+ # Iterate over releases.
2645+ for release , manifest in self .release_manifests .items ():
2646+ # Determine release path.
2647+ release_path = f"v{ release } "
2648+
2649+ # Iterate over sample sets in the release.
2650+ for rec in manifest .itertuples ():
2651+ sample_set = rec .sample_set
2652+ metadata_path = (
2653+ self .bucket_path
2654+ / release_path
2655+ / "metadata"
2656+ / "general"
2657+ / sample_set
2658+ / "samples.meta.csv"
2659+ )
2660+
2661+ # Create zarr hierarchy.
2662+ zarr_path = (
2663+ self .bucket_path
2664+ / release_path
2665+ / "snp_genotypes"
2666+ / "all"
2667+ / sample_set
2668+ )
2669+
2670+ # Simulate SNP genotype data.
2671+ p_allele = np .array ([0.981 , 0.006 , 0.008 , 0.005 ])
2672+ p_missing = np .array ([0.95 , 0.05 ])
2673+ simulate_snp_genotypes (
2674+ zarr_path = zarr_path ,
2675+ metadata_path = metadata_path ,
2676+ contigs = self .contigs ,
2677+ n_sites = self .n_snp_sites ,
2678+ p_allele = p_allele ,
2679+ p_missing = p_missing ,
2680+ )
2681+
2682+ def init_site_annotations (self ):
2683+ path = self .bucket_path / self .config ["SITE_ANNOTATIONS_ZARR_PATH" ]
2684+ simulate_site_annotations (path = path , genome = self .genome )
2685+
2686+
23672687# For the following data fixtures we will use the "session" scope
23682688# so that the fixture data will be created only once per test
23692689# session (i.e., per invocation of pytest).
@@ -2384,3 +2704,8 @@ def ag3_sim_fixture(fixture_dir):
23842704@pytest .fixture (scope = "session" )
23852705def af1_sim_fixture (fixture_dir ):
23862706 return Af1Simulator (fixture_dir = fixture_dir )
2707+
2708+
2709+ @pytest .fixture (scope = "session" )
2710+ def adir1_sim_fixture (fixture_dir ):
2711+ return Adir1Simulator (fixture_dir = fixture_dir )
0 commit comments