Skip to content

Commit 7851553

Browse files
accidentally mangled conftest, thanks claude code, working now
1 parent 569105e commit 7851553

71 files changed

Lines changed: 3263 additions & 67 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

tests/anoph/conftest.py

Lines changed: 285 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3195,6 +3195,286 @@ def init_site_annotations(self):
31953195
simulate_site_annotations(path=path, genome=self.genome, rng=self.rng)
31963196

31973197

3198+
class As1Simulator(AnophelesSimulator):
    """Simulator for the As1 (Anopheles stephensi) data resource.

    Each ``init_*`` method writes one part of a simulated release
    underneath ``self.bucket_path``. The order in which these methods are
    invoked is decided outside this class (presumably by the
    ``AnophelesSimulator`` base class -- not visible here).
    """

    def __init__(self, fixture_dir, rng):
        """Set up the simulator for the single release "1.0".

        Parameters
        ----------
        fixture_dir
            Fixture directory, forwarded to the base class. Source metadata
            CSVs are later read from beneath it by ``write_metadata``.
        rng
            Random number generator, forwarded to the base class and used
            by the simulation steps via ``self.rng``.
        """
        super().__init__(
            fixture_dir=fixture_dir,
            rng=rng,
            bucket="vo_aste_release_master_us_central1",
            releases=("1.0",),
            has_aims=False,
            has_cohorts_by_quarter=True,
            has_sequence_qc=True,
        )

    def init_config(self):
        """Build the release configuration and write it as JSON.

        The configuration is kept on ``self.config`` and dumped to
        ``v1.0-config.json`` at the top of the bucket.
        """
        self.config = {
            "PUBLIC_RELEASES": ["1.0"],
            "GENESET_GFF3_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.gff.gz",
            "GENOME_FASTA_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.fasta",
            "GENOME_FAI_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.fasta.fai",
            "GENOME_ZARR_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated.zarr",
            "GENOME_REF_ID": "UCISS2018",
            "GENOME_REF_NAME": "Anopheles stephensi",
            "CONTIGS": ["2RL", "3RL", "X"],
            "SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/UCISS2018/Anopheles-stephensi-UCISS2018-curated_SEQANNOTATION.zarr",
            "DEFAULT_SITE_FILTERS_ANALYSIS": "sc_20260401",
            "DEFAULT_COHORTS_ANALYSIS": "20260402",
            "DEFAULT_DISCORDANT_READ_CALLS_ANALYSIS": "",
            "SITE_MASK_IDS": ["stephensi"],
            "PHASING_ANALYSIS_IDS": [],
            "COVERAGE_CALLS_ANALYSIS_IDS": ["stephensi"],
        }
        config_path = self.bucket_path / "v1.0-config.json"
        with config_path.open(mode="w") as f:
            json.dump(self.config, f, indent=4)

    def init_public_release_manifest(self):
        """Write the v1.0 manifest and register it in ``self.release_manifests``.

        The manifest lists three sample sets with the number of samples to
        simulate for each, and is saved as a tab-separated
        ``v1.0/manifest.tsv``.
        """
        release_path = self.bucket_path / "v1.0"
        release_path.mkdir(parents=True, exist_ok=True)
        manifest_path = release_path / "manifest.tsv"
        manifest = pd.DataFrame(
            {
                "sample_set": [
                    "1365-VO-DJ-ADBI-VMF00318",
                    "1386-VO-KE-OCHOMO-VMF00339",
                    "1367-VO-AF-DONNELLY-VMF00320",
                ],
                "sample_count": [21, 29, 24],
                "study_id": [
                    "1365-VO-DJ-ADBI",
                    "1386-VO-KE-OCHOMO",
                    "1367-VO-AF-DONNELLY",
                ],
                "study_url": [
                    "https://www.malariagen.net/network/where-we-work/1365-VO-DJ-ADBI",
                    "https://www.malariagen.net/network/where-we-work/1386-VO-KE-OCHOMO",
                    "https://www.malariagen.net/network/where-we-work/1367-VO-AF-DONNELLY",
                ],
                "terms_of_use_expiry_date": [
                    "2099-12-31",
                    "2099-12-31",
                    "2024-01-01",  # Set to the past in order to test unrestricted_use_only.
                ],
                "terms_of_use_url": [
                    "https://malariagen.github.io/vector-data/as1/as1.0.html#terms-of-use",
                    "https://malariagen.github.io/vector-data/as1/as1.0.html#terms-of-use",
                    "https://malariagen.github.io/vector-data/as1/as1.0.html#terms-of-use",
                ],
            }
        )
        manifest.to_csv(manifest_path, index=False, sep="\t")
        self.release_manifests["1.0"] = manifest

    def init_genome_sequence(self):
        """Simulate the reference genome and record the size of each contig.

        Results are stored on ``self.genome`` and ``self.contig_sizes``.
        """
        # Only upper-case bases (plus a very small share of "N") get a
        # non-zero weight; lower-case symbols are given weight zero.
        base_composition = {
            b"a": 0.0,
            b"c": 0.0,
            b"g": 0.0,
            b"t": 0.0,
            b"n": 0.0,
            b"A": 0.29432128333333335,
            b"C": 0.20542065,
            b"G": 0.20575796666666665,
            b"T": 0.2944834333333333,
            b"N": 1.6666666666666667e-05,
        }
        path = self.bucket_path / self.config["GENOME_ZARR_PATH"]
        self.genome = simulate_genome(
            path=path,
            contigs=self.contigs,
            low=80_000,
            high=120_000,
            base_composition=base_composition,
            rng=self.rng,
        )
        self.contig_sizes = {
            contig: self.genome[contig].shape[0] for contig in self.contigs
        }

    def init_genome_features(self):
        """Simulate a GFF3 gene set and keep the result on ``self.genome_features``.

        Reads ``self.contig_sizes``, so the genome sequence must already
        have been simulated.
        """
        path = self.bucket_path / self.config["GENESET_GFF3_PATH"]
        path.parent.mkdir(parents=True, exist_ok=True)
        simulator = Gff3Simulator(
            contig_sizes=self.contig_sizes,
            rng=self.rng,
            gene_type="protein_coding_gene",
            attrs=("Note", "description"),
        )
        self.genome_features = simulator.simulate_gff(path=path)

    def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
        """Copy a random subset of one sample set's metadata into the bucket.

        A subset of rows is drawn from the fixture ``samples.meta.csv``; the
        remaining metadata files are then restricted to the same sample IDs,
        in the same order, and written to the matching location under
        ``self.bucket_path``.

        Parameters
        ----------
        release
            Key into ``self.release_manifests`` (e.g. "1.0").
        release_path
            Release directory name (e.g. "v1.0").
        sample_set
            Sample set identifier; must appear in the release manifest.
        sequence_qc
            If true, also copy ``sequence_qc_stats.csv``.
        """
        n_samples_sim = (
            self.release_manifests[release]
            .set_index("sample_set")
            .loc[sample_set, "sample_count"]
        )

        # The source directory repeats the bucket name given to the base
        # class in __init__.
        src_root = (
            self.fixture_dir / "vo_aste_release_master_us_central1" / release_path
        )
        dst_root = self.bucket_path / release_path

        def locate(section, filename):
            # Source (fixture) and destination (bucket) paths for one file.
            src = src_root / "metadata" / section / sample_set / filename
            dst = dst_root / "metadata" / section / sample_set / filename
            return src, dst

        def save(df, dst):
            # Write a CSV without the index, creating parent directories.
            dst.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(dst, index=False)

        def copy_subset(section, filename, sample_ids):
            # Copy one CSV, keeping only `sample_ids` rows in that order.
            src, dst = locate(section, filename)
            df = pd.read_csv(src)
            save(df.set_index("sample_id").loc[sample_ids].reset_index(), dst)

        # General sample metadata: this is where the random draw happens.
        src_path, dst_path = locate("general", "samples.meta.csv")
        df_general = pd.read_csv(src_path)
        df_general_ds = df_general.sample(
            n_samples_sim, replace=False, random_state=self.rng
        )
        samples_ds = df_general_ds["sample_id"].tolist()
        save(df_general_ds, dst_path)

        if sequence_qc:
            copy_subset("curation", "sequence_qc_stats.csv", samples_ds)

        # The directory suffix equals DEFAULT_COHORTS_ANALYSIS in init_config;
        # the two values need to be kept in sync by hand.
        copy_subset("cohorts_20260402", "samples.cohorts.csv", samples_ds)

        copy_subset("general", "wgs_snp_data.csv", samples_ds)

    def init_metadata(self):
        """Write metadata for each of the three sample sets of release 1.0."""
        # Same sample sets, in the same order, as the release manifest, so
        # the shared RNG is consumed in a fixed sequence.
        sample_sets = (
            "1365-VO-DJ-ADBI-VMF00318",
            "1386-VO-KE-OCHOMO-VMF00339",
            "1367-VO-AF-DONNELLY-VMF00320",
        )
        for sample_set in sample_sets:
            self.write_metadata(
                release="1.0", release_path="v1.0", sample_set=sample_set
            )

    def init_snp_sites(self):
        """Simulate SNP sites; store them and their counts on the instance."""
        path = self.bucket_path / "v1.0/snp_genotypes/all/sites/"
        self.snp_sites, self.n_snp_sites = simulate_snp_sites(
            path=path, contigs=self.contigs, genome=self.genome
        )

    def init_site_filters(self):
        """Simulate site filters for the "stephensi" mask."""
        analysis = self.config["DEFAULT_SITE_FILTERS_ANALYSIS"]
        # "stephensi" is the only entry of SITE_MASK_IDS in init_config.
        mask = "stephensi"
        p_pass = 0.59
        path = self.bucket_path / "v1.0/site_filters" / analysis / mask
        simulate_site_filters(
            path=path,
            contigs=self.contigs,
            p_pass=p_pass,
            n_sites=self.n_snp_sites,
            rng=self.rng,
        )

    def init_snp_genotypes(self):
        """Simulate SNP genotypes for every sample set of every release.

        Uses the ``samples.meta.csv`` files already written into the bucket
        by ``write_metadata``, together with ``self.n_snp_sites``.
        """
        for release, manifest in self.release_manifests.items():
            release_path = f"v{release}"
            for rec in manifest.itertuples():
                sample_set = rec.sample_set
                metadata_path = (
                    self.bucket_path
                    / release_path
                    / "metadata"
                    / "general"
                    / sample_set
                    / "samples.meta.csv"
                )
                zarr_path = (
                    self.bucket_path
                    / release_path
                    / "snp_genotypes"
                    / "all"
                    / sample_set
                )
                # Fresh arrays per call, as in the original; presumably
                # allele and missingness probabilities -- the helper's
                # handling of them is not visible here.
                p_allele = np.array([0.981, 0.006, 0.008, 0.005])
                p_missing = np.array([0.95, 0.05])
                simulate_snp_genotypes(
                    zarr_path=zarr_path,
                    metadata_path=metadata_path,
                    contigs=self.contigs,
                    n_sites=self.n_snp_sites,
                    p_allele=p_allele,
                    p_missing=p_missing,
                    rng=self.rng,
                )

    def init_site_annotations(self):
        """Simulate site annotations for the simulated genome."""
        path = self.bucket_path / self.config["SITE_ANNOTATIONS_ZARR_PATH"]
        simulate_site_annotations(path=path, genome=self.genome, rng=self.rng)
3476+
3477+
31983478
# For the following data fixtures we will use the "session" scope
31993479
# so that the fixture data will be created only once per test
32003480
# session (i.e., per invocation of pytest).
@@ -3228,3 +3508,8 @@ def adir1_sim_fixture(fixture_dir):
32283508
@pytest.fixture(scope="session")
def amin1_sim_fixture(fixture_dir):
    """Session-scoped ``Amin1Simulator`` using the RNG from ``create_rng("Amin1")``."""
    rng = create_rng("Amin1")
    simulator = Amin1Simulator(fixture_dir=fixture_dir, rng=rng)
    return simulator
3511+
3512+
3513+
@pytest.fixture(scope="session")
def as1_sim_fixture(fixture_dir):
    """Session-scoped ``As1Simulator`` using the RNG from ``create_rng("As1")``."""
    rng = create_rng("As1")
    simulator = As1Simulator(fixture_dir=fixture_dir, rng=rng)
    return simulator

0 commit comments

Comments
 (0)