@@ -3195,6 +3195,286 @@ def init_site_annotations(self):
31953195 simulate_site_annotations (path = path , genome = self .genome , rng = self .rng )
31963196
31973197
class As1Simulator(AnophelesSimulator):
    """Simulator producing a fake As1 (Anopheles stephensi) release for tests.

    Everything is written underneath ``self.bucket_path``. Real metadata is
    read from ``self.fixture_dir`` and down-sampled; genome, SNP and filter
    data are generated by the ``simulate_*`` helpers defined elsewhere in
    this module.
    """

    def __init__(self, fixture_dir, rng):
        """Configure the base simulator for the single As1 release "1.0".

        Parameters
        ----------
        fixture_dir
            Directory holding the source fixture files (joined with ``/``).
        rng
            Random generator shared by all simulation steps.
        """
        super().__init__(
            fixture_dir=fixture_dir,
            rng=rng,
            bucket="vo_aste_release_master_us_central1",
            releases=("1.0",),
            has_aims=False,
            has_cohorts_by_quarter=True,
            has_sequence_qc=True,
        )

    def init_config(self):
        """Build the release configuration and write it as JSON to the bucket."""
        self.config = {
            "PUBLIC_RELEASES": ["1.0"],
            "GENESET_GFF3_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.gff.gz",
            "GENOME_FASTA_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.fasta",
            "GENOME_FAI_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.fasta.fai",
            "GENOME_ZARR_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.zarr",
            "GENOME_REF_ID": "UCISS2018",
            "GENOME_REF_NAME": "Anopheles stephensi",
            "CONTIGS": ["2RL", "3RL", "X"],
            "SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated_SEQANNOTATION.zarr",
            "DEFAULT_SITE_FILTERS_ANALYSIS": "sc_20260401",
            "DEFAULT_COHORTS_ANALYSIS": "20260402",
            "DEFAULT_DISCORDANT_READ_CALLS_ANALYSIS": "",
            "SITE_MASK_IDS": ["stephensi"],
            "PHASING_ANALYSIS_IDS": [],
            "COVERAGE_CALLS_ANALYSIS_IDS": ["stephensi"],
        }
        config_path = self.bucket_path / "v1.0-config.json"
        with config_path.open(mode="w") as f:
            json.dump(self.config, f, indent=4)

    def init_public_release_manifest(self):
        """Write the v1.0 manifest TSV and cache it in ``release_manifests``."""
        release_path = self.bucket_path / "v1.0"
        release_path.mkdir(parents=True, exist_ok=True)
        manifest_path = release_path / "manifest.tsv"
        terms_of_use_url = (
            "https://malariagen.github.io/vector-data/as1/as1.0.html#terms-of-use"
        )
        manifest = pd.DataFrame(
            {
                "sample_set": [
                    "1365-VO-DJ-ADBI-VMF00318",
                    "1386-VO-KE-OCHOMO-VMF00339",
                    "1367-VO-AF-DONNELLY-VMF00320",
                ],
                "sample_count": [21, 29, 24],
                "study_id": [
                    "1365-VO-DJ-ADBI",
                    "1386-VO-KE-OCHOMO",
                    "1367-VO-AF-DONNELLY",
                ],
                "study_url": [
                    "https://www.malariagen.net/network/where-we-work/1365-VO-DJ-ADBI",
                    "https://www.malariagen.net/network/where-we-work/1386-VO-KE-OCHOMO",
                    "https://www.malariagen.net/network/where-we-work/1367-VO-AF-DONNELLY",
                ],
                "terms_of_use_expiry_date": [
                    "2099-12-31",
                    "2099-12-31",
                    "2024-01-01",  # Set to the past in order to test unrestricted_use_only.
                ],
                "terms_of_use_url": [terms_of_use_url] * 3,
            }
        )
        # The separator must be a single tab character: pandas rejects
        # delimiters that are longer than one character.
        manifest.to_csv(manifest_path, index=False, sep="\t")
        self.release_manifests["1.0"] = manifest

    def init_genome_sequence(self):
        """Simulate the reference genome and record the size of each contig."""
        # Values handed to simulate_genome as ``base_composition``; the
        # lower-case and "N" entries are zero or close to zero.
        # NOTE(review): presumably per-base sampling probabilities - confirm
        # against simulate_genome.
        base_composition = {
            b"a": 0.0,
            b"c": 0.0,
            b"g": 0.0,
            b"t": 0.0,
            b"n": 0.0,
            b"A": 0.29432128333333335,
            b"C": 0.20542065,
            b"G": 0.20575796666666665,
            b"T": 0.2944834333333333,
            b"N": 1.6666666666666667e-05,
        }
        path = self.bucket_path / self.config["GENOME_ZARR_PATH"]
        self.genome = simulate_genome(
            path=path,
            contigs=self.contigs,
            low=80_000,
            high=120_000,
            base_composition=base_composition,
            rng=self.rng,
        )
        self.contig_sizes = {
            contig: self.genome[contig].shape[0] for contig in self.contigs
        }

    def init_genome_features(self):
        """Simulate a GFF3 gene set at the configured path and keep the result."""
        path = self.bucket_path / self.config["GENESET_GFF3_PATH"]
        path.parent.mkdir(parents=True, exist_ok=True)
        simulator = Gff3Simulator(
            contig_sizes=self.contig_sizes,
            rng=self.rng,
            gene_type="protein_coding_gene",
            attrs=("Note", "description"),
        )
        self.genome_features = simulator.simulate_gff(path=path)

    def _metadata_paths(self, release_path, subdir, sample_set, filename):
        """Return the (fixture source, bucket destination) paths of a metadata file."""
        src_path = (
            self.fixture_dir
            / "vo_aste_release_master_us_central1"
            / release_path
            / "metadata"
            / subdir
            / sample_set
            / filename
        )
        dst_path = (
            self.bucket_path
            / release_path
            / "metadata"
            / subdir
            / sample_set
            / filename
        )
        return src_path, dst_path

    def _write_metadata_subset(
        self, release_path, subdir, sample_set, filename, sample_ids
    ):
        """Copy one fixture CSV to the bucket, keeping only ``sample_ids``.

        Rows are selected with ``.loc`` on the ``sample_id`` column, so the
        output follows the order of ``sample_ids``.
        """
        src_path, dst_path = self._metadata_paths(
            release_path, subdir, sample_set, filename
        )
        df = pd.read_csv(src_path)
        df_ds = df.set_index("sample_id").loc[sample_ids].reset_index()
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        df_ds.to_csv(dst_path, index=False)

    def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
        """Write down-sampled metadata tables for one sample set.

        The general sample metadata is randomly down-sampled (without
        replacement, using ``self.rng``) to the ``sample_count`` listed for
        the sample set in the release manifest. The remaining tables are
        then restricted to the same samples.

        Parameters
        ----------
        release
            Key into ``self.release_manifests``, e.g. "1.0".
        release_path
            Release directory name, e.g. "v1.0".
        sample_set
            Sample set identifier as listed in the manifest.
        sequence_qc
            When true, also write ``curation/.../sequence_qc_stats.csv``.
        """
        n_samples_sim = (
            self.release_manifests[release]
            .set_index("sample_set")
            .loc[sample_set]["sample_count"]
        )

        # General metadata: this is the only table that consumes the RNG,
        # and it determines which samples every other table retains.
        src_path, dst_path = self._metadata_paths(
            release_path, "general", sample_set, "samples.meta.csv"
        )
        df_general = pd.read_csv(src_path)
        df_general_ds = df_general.sample(
            n_samples_sim, replace=False, random_state=self.rng
        )
        samples_ds = df_general_ds["sample_id"].tolist()
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        df_general_ds.to_csv(dst_path, index=False)

        if sequence_qc:
            self._write_metadata_subset(
                release_path,
                "curation",
                sample_set,
                "sequence_qc_stats.csv",
                samples_ds,
            )

        # The directory suffix matches the DEFAULT_COHORTS_ANALYSIS value
        # written by init_config ("20260402").
        self._write_metadata_subset(
            release_path,
            "cohorts_20260402",
            sample_set,
            "samples.cohorts.csv",
            samples_ds,
        )

        self._write_metadata_subset(
            release_path,
            "general",
            sample_set,
            "wgs_snp_data.csv",
            samples_ds,
        )

    def init_metadata(self):
        """Write metadata for every sample set in the "1.0" manifest, in order."""
        for sample_set in self.release_manifests["1.0"]["sample_set"]:
            self.write_metadata(
                release="1.0", release_path="v1.0", sample_set=sample_set
            )

    def init_snp_sites(self):
        """Simulate SNP sites and keep the returned sites and site counts."""
        path = self.bucket_path / "v1.0/snp_genotypes/all/sites/"
        self.snp_sites, self.n_snp_sites = simulate_snp_sites(
            path=path, contigs=self.contigs, genome=self.genome
        )

    def init_site_filters(self):
        """Simulate site filters for the "stephensi" mask of the default analysis."""
        analysis = self.config["DEFAULT_SITE_FILTERS_ANALYSIS"]
        mask = "stephensi"
        p_pass = 0.59
        path = self.bucket_path / "v1.0/site_filters" / analysis / mask
        simulate_site_filters(
            path=path,
            contigs=self.contigs,
            p_pass=p_pass,
            n_sites=self.n_snp_sites,
            rng=self.rng,
        )

    def init_snp_genotypes(self):
        """Simulate SNP genotypes for every sample set of every release."""
        # Identical for all sample sets, so built once outside the loops.
        # NOTE(review): presumably probabilities of each allele and of a
        # call being present/missing - confirm against simulate_snp_genotypes.
        p_allele = np.array([0.981, 0.006, 0.008, 0.005])
        p_missing = np.array([0.95, 0.05])
        for release, manifest in self.release_manifests.items():
            # No stray whitespace: must match the "v1.0" directory used when
            # the metadata was written.
            release_path = f"v{release}"
            for rec in manifest.itertuples():
                sample_set = rec.sample_set
                metadata_path = (
                    self.bucket_path
                    / release_path
                    / "metadata"
                    / "general"
                    / sample_set
                    / "samples.meta.csv"
                )
                zarr_path = (
                    self.bucket_path
                    / release_path
                    / "snp_genotypes"
                    / "all"
                    / sample_set
                )
                simulate_snp_genotypes(
                    zarr_path=zarr_path,
                    metadata_path=metadata_path,
                    contigs=self.contigs,
                    n_sites=self.n_snp_sites,
                    p_allele=p_allele,
                    p_missing=p_missing,
                    rng=self.rng,
                )

    def init_site_annotations(self):
        """Simulate site annotations for the genome at the configured path."""
        path = self.bucket_path / self.config["SITE_ANNOTATIONS_ZARR_PATH"]
        simulate_site_annotations(path=path, genome=self.genome, rng=self.rng)
3477+
31983478# For the following data fixtures we will use the "session" scope
31993479# so that the fixture data will be created only once per test
32003480# session (i.e., per invocation of pytest).
@@ -3228,3 +3508,8 @@ def adir1_sim_fixture(fixture_dir):
@pytest.fixture(scope="session")
def amin1_sim_fixture(fixture_dir):
    """Session-scoped Amin1 simulator, seeded via ``create_rng("Amin1")``."""
    rng = create_rng("Amin1")
    return Amin1Simulator(fixture_dir=fixture_dir, rng=rng)
3511+
3512+
@pytest.fixture(scope="session")
def as1_sim_fixture(fixture_dir):
    """Session-scoped As1 simulator, seeded via ``create_rng("As1")``."""
    rng = create_rng("As1")
    return As1Simulator(fixture_dir=fixture_dir, rng=rng)
0 commit comments