malariagen
diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
Lines changed: 325 additions & 0 deletions
diff --git a/tests/anoph/fixture/vo_adir_release_master_us_central1/v1.0/metadata/cohorts_20250710/1276-AD-BD-ALAM-VMF00156/samples.admin_units.csv b/tests/anoph/fixture/vo_adir_release_master_us_central1/v1.0/metadata/cohorts_20250710/1276-AD-BD-ALAM-VMF00156/samples.admin_units.csv
Lines changed: 48 additions & 0 deletions
@@ -2364,6 +2364,326 @@ def init_cnv_discordant_read_calls(self):
                 )
 
 
+class Adir1Simulator(AnophelesSimulator):
+    # Simulated Adir1-style data resource for tests. Each init_* method below
+    # writes one part of the simulated resource under self.bucket_path.
+    # NOTE(review): the order in which the init_* hooks are invoked is decided
+    # by AnophelesSimulator, which is not visible here.
+    def __init__(self, fixture_dir):
+        # Configure the base simulator for the Adir1 bucket with a single
+        # release, "1.0".
+        # fixture_dir is forwarded unchanged; presumably the base class stores
+        # it as self.fixture_dir, which write_metadata() reads — confirm.
+        # NOTE(review): the meaning of the has_* flags is defined by the base
+        # class and is not visible here.
+        super().__init__(
+            fixture_dir=fixture_dir,
+            bucket="vo_adir_release_master_us_central1",
+            releases=("1.0",),
+            has_aims=False,
+            has_cohorts_by_quarter=False,
+            has_sequence_qc=True,
+        )
+
+    def init_config(self):
+        # Build the release configuration, keep it on self.config and write it
+        # as JSON to <bucket_path>/v1.0-config.json.
+        self.config = {
+            "PUBLIC_RELEASES": ["1.0"],
+            "GENESET_GFF3_PATH": "reference/genome/AdirusWRAIR2/VectorBase-68_AdirusWRAIR2.gff.gz",
+            "GENOME_FASTA_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta",
+            "GENOME_FAI_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.fasta.fai",
+            "GENOME_ZARR_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.zarr",
+            "GENOME_REF_ID": "AdirusWRAIR2",
+            "GENOME_REF_NAME": "Anopheles dirus",
+            "CONTIGS": [
+                "KB672490",
+                "KB672868",
+                "KB672979",
+            ],  # Just using the three largest.
+            "SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/AdirusWRAIR2/VectorBase-56_AdirusWRAIR2_Genome.SEQANNOTATION.zarr",
+            "DEFAULT_SITE_FILTERS_ANALYSIS": "sc_20250610",
+            "DEFAULT_COHORTS_ANALYSIS": "20250710",
+            "DEFAULT_DISCORDANT_READ_CALLS_ANALYSIS": "",
+            "SITE_MASK_IDS": ["dirus"],
+            "PHASING_ANALYSIS_IDS": ["dirus_noneyet"],
+        }
+        config_path = self.bucket_path / "v1.0-config.json"
+        with config_path.open(mode="w") as f:
+            json.dump(self.config, f, indent=4)
+
+    def init_public_release_manifest(self):
+        # Here we create a release manifest for an Adir1-style
+        # public release. Note this is not the exact same data
+        # as the real release.
+        # The manifest is written tab-separated to <bucket_path>/v1.0/manifest.tsv
+        # and also kept in self.release_manifests["1.0"], where
+        # write_metadata() and init_snp_genotypes() read it.
+        release_path = self.bucket_path / "v1.0"
+        release_path.mkdir(parents=True, exist_ok=True)
+        manifest_path = release_path / "manifest.tsv"
+        manifest = pd.DataFrame(
+            {
+                "sample_set": [
+                    "1277-VO-KH-WITKOWSKI-VMF00151",
+                    "1276-AD-BD-ALAM-VMF00156",
+                ],
+                "sample_count": [20, 10],
+                "study_id": [
+                    "1277-VO-KH-WITKOWSKI",
+                    "1276-AD-BD-ALAM",
+                ],
+                "study_url": [
+                    "https://www.malariagen.net/network/where-we-work/1277-VO-KH-WITKOWSKI",
+                    "https://www.malariagen.net/network/where-we-work/1276-AD-BD-ALAM",
+                ],
+                "terms_of_use_expiry_date": [
+                    "2027-06-01",
+                    "2027-06-01",
+                ],
+                "terms_of_use_url": [
+                    "https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use",
+                    "https://malariagen.github.io/vector-data/adir1/adir1.0.html#terms-of-use",
+                ],
+            }
+        )
+        manifest.to_csv(manifest_path, index=False, sep="\t")
+        self.release_manifests["1.0"] = manifest
+
+    def init_genome_sequence(self):
+        # Here we simulate a reference genome in a simple way
+        # but with much smaller contigs. The data are stored
+        # using zarr as with the real data releases.
+        # Sets self.genome and self.contig_sizes (contig name -> length).
+        # NOTE(review): self.contigs is not assigned in this class; presumably
+        # the base class derives it from the "CONTIGS" config entry — confirm.
+
+        # Use real base composition.
+        base_composition = {
+            b"a": 0.0,
+            b"c": 0.0,
+            b"g": 0.0,
+            b"t": 0.0,
+            b"n": 0.0,
+            b"A": 0.29432128333333335,
+            b"C": 0.20542065,
+            b"G": 0.20575796666666665,
+            b"T": 0.2944834333333333,
+            b"N": 1.6666666666666667e-05,
+        }
+        path = self.bucket_path / self.config["GENOME_ZARR_PATH"]
+        # NOTE(review): low/high are presumably bounds on the simulated contig
+        # length — confirm against simulate_genome.
+        self.genome = simulate_genome(
+            path=path,
+            contigs=self.contigs,
+            low=80_000,
+            high=120_000,
+            base_composition=base_composition,
+        )
+        self.contig_sizes = {
+            contig: self.genome[contig].shape[0] for contig in self.contigs
+        }
+
+    def init_genome_features(self):
+        # Simulate a GFF3 annotation file at the configured GENESET_GFF3_PATH
+        # and keep the result of simulate_gff() on self.genome_features.
+        # Reads self.contig_sizes, which init_genome_sequence() sets.
+        path = self.bucket_path / self.config["GENESET_GFF3_PATH"]
+        path.parent.mkdir(parents=True, exist_ok=True)
+        simulator = Gff3Simulator(
+            contig_sizes=self.contig_sizes,
+            # NOTE(review): this comment originally read "Af1 has a different
+            # gene type"; confirm this gene type is also right for Adir1.
+            gene_type="protein_coding_gene",
+            # NOTE(review): this comment originally read "Af1 has different
+            # attributes"; confirm these attributes are also right for Adir1.
+            attrs=("Note", "description"),
+        )
+        self.genome_features = simulator.simulate_gff(path=path)
+
+    def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
+        # Here we take the approach of using some of the real metadata,
+        # but truncating it to the number of samples included in the
+        # simulated data resource.
+        #
+        # release: key into self.release_manifests (e.g. "1.0").
+        # release_path: directory name of the release (e.g. "v1.0"), used under
+        #     both the fixture directory and the bucket.
+        # sample_set: sample set identifier; must be a row of the manifest.
+        # sequence_qc: when true, also write sequence_qc_stats.csv.
+        #
+        # All derived files are restricted to the sample IDs drawn for the
+        # general metadata, via .loc[samples_ds]; this relies on every drawn ID
+        # being present in each source file.
+
+        # Look up the number of samples in this sample set within the
+        # simulated data resource.
+        n_samples_sim = (
+            self.release_manifests[release]
+            .set_index("sample_set")
+            .loc[sample_set]["sample_count"]
+        )
+
+        # Create general metadata by sampling from some real metadata files.
+        src_path = (
+            self.fixture_dir
+            / "vo_adir_release_master_us_central1"
+            / release_path
+            / "metadata"
+            / "general"
+            / sample_set
+            / "samples.meta.csv"
+        )
+        df_general = pd.read_csv(src_path)
+        # Draw without replacement; no random_state is passed, so the subset is
+        # not fixed by this call.
+        df_general_ds = df_general.sample(n_samples_sim, replace=False)
+        samples_ds = df_general_ds["sample_id"].tolist()
+        dst_path = (
+            self.bucket_path
+            / release_path
+            / "metadata"
+            / "general"
+            / sample_set
+            / "samples.meta.csv"
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        df_general_ds.to_csv(dst_path, index=False)
+
+        if sequence_qc:
+            # Create sequence QC metadata by sampling from real metadata files.
+            src_path = (
+                self.fixture_dir
+                / "vo_adir_release_master_us_central1"
+                / release_path
+                / "metadata"
+                / "curation"
+                / sample_set
+                / "sequence_qc_stats.csv"
+            )
+            df_sequence_qc_stats = pd.read_csv(src_path)
+            df_sequence_qc_stats_ds = (
+                df_sequence_qc_stats.set_index("sample_id")
+                .loc[samples_ds]
+                .reset_index()
+            )
+            dst_path = (
+                self.bucket_path
+                / release_path
+                / "metadata"
+                / "curation"
+                / sample_set
+                / "sequence_qc_stats.csv"
+            )
+            dst_path.parent.mkdir(parents=True, exist_ok=True)
+            df_sequence_qc_stats_ds.to_csv(dst_path, index=False)
+
+        # Create cohorts metadata by sampling from some real metadata files.
+        src_path = (
+            self.fixture_dir
+            / "vo_adir_release_master_us_central1"
+            / release_path
+            / "metadata"
+            / "cohorts_20250710"
+            / sample_set
+            / "samples.cohorts.csv"
+        )
+        df_coh = pd.read_csv(src_path)
+        df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
+        dst_path = (
+            self.bucket_path
+            / release_path
+            / "metadata"
+            / "cohorts_20250710"
+            / sample_set
+            / "samples.cohorts.csv"
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        df_coh_ds.to_csv(dst_path, index=False)
+
+        # Create data catalog by sampling from some real metadata files.
+        src_path = (
+            self.fixture_dir
+            / "vo_adir_release_master_us_central1"
+            / release_path
+            / "metadata"
+            / "general"
+            / sample_set
+            / "wgs_snp_data.csv"
+        )
+        df_cat = pd.read_csv(src_path)
+        df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
+        dst_path = (
+            self.bucket_path
+            / release_path
+            / "metadata"
+            / "general"
+            / sample_set
+            / "wgs_snp_data.csv"
+        )
+        dst_path.parent.mkdir(parents=True, exist_ok=True)
+        df_cat_ds.to_csv(dst_path, index=False)
+
+    # NOTE(review): the accessions catalog step below is commented out, so
+    # write_metadata() does not write wgs_accession_data.csv.
+    #        # Create accessions catalog by sampling from some real metadata files.
+    #        src_path = (
+    #            self.fixture_dir
+    #            / "vo_adir_release_master_us_central1"
+    #            / release_path
+    #            / "metadata"
+    #            / "general"
+    #            / sample_set
+    #            / "wgs_accession_data.csv"
+    #        )
+    #        df_cat = pd.read_csv(src_path)
+    #        df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
+    #        dst_path = (
+    #            self.bucket_path
+    #            / release_path
+    #            / "metadata"
+    #            / "general"
+    #            / sample_set
+    #            / "wgs_accession_data.csv"
+    #        )
+    #        dst_path.parent.mkdir(parents=True, exist_ok=True)
+    #        df_cat_ds.to_csv(dst_path, index=False)
+
+    def init_metadata(self):
+        # Write metadata for the two sample sets listed in the release
+        # manifest, using the default sequence_qc=True.
+        self.write_metadata(
+            release="1.0",
+            release_path="v1.0",
+            sample_set="1277-VO-KH-WITKOWSKI-VMF00151",
+        )
+        self.write_metadata(
+            release="1.0",
+            release_path="v1.0",
+            sample_set="1276-AD-BD-ALAM-VMF00156",
+        )
+
+    def init_snp_sites(self):
+        # Simulate SNP sites under v1.0/snp_genotypes/all/sites/ and keep both
+        # values returned by simulate_snp_sites on self.snp_sites and
+        # self.n_snp_sites.
+        path = self.bucket_path / "v1.0/snp_genotypes/all/sites/"
+        self.snp_sites, self.n_snp_sites = simulate_snp_sites(
+            path=path, contigs=self.contigs, genome=self.genome
+        )
+
+    def init_site_filters(self):
+        # Simulate site filters for the default site filters analysis, using
+        # self.n_snp_sites as set by init_snp_sites().
+        analysis = self.config["DEFAULT_SITE_FILTERS_ANALYSIS"]
+
+        # Simulate the dirus mask.
+        mask = "dirus"
+        p_pass = 0.59
+        path = self.bucket_path / "v1.0/site_filters" / analysis / mask
+        simulate_site_filters(
+            path=path, contigs=self.contigs, p_pass=p_pass, n_sites=self.n_snp_sites
+        )
+
+    def init_snp_genotypes(self):
+        # Simulate SNP genotypes for every sample set of every release in
+        # self.release_manifests. The metadata path passed on is the
+        # samples.meta.csv location that write_metadata() writes to.
+        # Iterate over releases.
+        for release, manifest in self.release_manifests.items():
+            # Determine release path.
+            release_path = f"v{release}"
+
+            # Iterate over sample sets in the release.
+            for rec in manifest.itertuples():
+                sample_set = rec.sample_set
+                metadata_path = (
+                    self.bucket_path
+                    / release_path
+                    / "metadata"
+                    / "general"
+                    / sample_set
+                    / "samples.meta.csv"
+                )
+
+                # Create zarr hierarchy.
+                zarr_path = (
+                    self.bucket_path
+                    / release_path
+                    / "snp_genotypes"
+                    / "all"
+                    / sample_set
+                )
+
+                # Simulate SNP genotype data.
+                # NOTE(review): p_allele (4 values) and p_missing (2 values)
+                # each sum to 1; presumably they are probabilities per allele
+                # and for called/missing — confirm against
+                # simulate_snp_genotypes.
+                p_allele = np.array([0.981, 0.006, 0.008, 0.005])
+                p_missing = np.array([0.95, 0.05])
+                simulate_snp_genotypes(
+                    zarr_path=zarr_path,
+                    metadata_path=metadata_path,
+                    contigs=self.contigs,
+                    n_sites=self.n_snp_sites,
+                    p_allele=p_allele,
+                    p_missing=p_missing,
+                )
+
+    def init_site_annotations(self):
+        # Simulate site annotations for self.genome at the configured
+        # SITE_ANNOTATIONS_ZARR_PATH.
+        path = self.bucket_path / self.config["SITE_ANNOTATIONS_ZARR_PATH"]
+        simulate_site_annotations(path=path, genome=self.genome)
+
+
 # For the following data fixtures we will use the "session" scope
 # so that the fixture data will be created only once per test
 # session (i.e., per invocation of pytest).
@@ -2384,3 +2704,8 @@ def ag3_sim_fixture(fixture_dir):
 @pytest.fixture(scope="session")
 def af1_sim_fixture(fixture_dir):
     return Af1Simulator(fixture_dir=fixture_dir)
+
+
+@pytest.fixture(scope="session")
+def adir1_sim_fixture(fixture_dir):
+    # Session-scoped fixture returning an Adir1Simulator built from fixture_dir.
+    return Adir1Simulator(fixture_dir=fixture_dir)
@@ -0,0 +1,48 @@
+sample_id,country,country_ISO,adm1_name,adm1_ISO,adm2_name
+VBS46299-6321STDY9453299,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46307-6321STDY9453307,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46315-6321STDY9453315,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46323-6321STDY9453323,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46331-6321STDY9453331,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46339-6321STDY9453339,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46300-6321STDY9453300,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46308-6321STDY9453308,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46316-6321STDY9453316,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46324-6321STDY9453324,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46332-6321STDY9453332,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46340-6321STDY9453340,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46301-6321STDY9453301,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46309-6321STDY9453309,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46317-6321STDY9453317,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46325-6321STDY9453325,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46333-6321STDY9453333,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46341-6321STDY9453341,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46302-6321STDY9453302,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46310-6321STDY9453310,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46318-6321STDY9453318,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46326-6321STDY9453326,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46334-6321STDY9453334,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46342-6321STDY9453342,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46303-6321STDY9453303,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46311-6321STDY9453311,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46319-6321STDY9453319,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46327-6321STDY9453327,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46335-6321STDY9453335,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46343-6321STDY9453343,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46304-6321STDY9453304,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46312-6321STDY9453312,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46320-6321STDY9453320,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46328-6321STDY9453328,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46336-6321STDY9453336,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46344-6321STDY9453344,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46305-6321STDY9453305,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46313-6321STDY9453313,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46321-6321STDY9453321,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46329-6321STDY9453329,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46337-6321STDY9453337,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati
+VBS46345-6321STDY9453345,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46306-6321STDY9453306,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46314-6321STDY9453314,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46322-6321STDY9453322,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46330-6321STDY9453330,Bangladesh,BGD,Chittagong Division,BD-B,Bandarban
+VBS46338-6321STDY9453338,Bangladesh,BGD,Chittagong Division,BD-B,Rangamati