Skip to content

Commit 485ef17

Browse files
committed
refactor: finalize karyotype GCS loading and simulated tests
- Replace arbitrary '20231213' analysis value with clearly synthetic 'simtest'
- Remove unnecessary config keys (KARYOTYPE_INVERSIONS, KARYOTYPE_TAG_SNPS_FILENAME)
- Hardcode filename in production code, matching site_filters/aim patterns
- Use explicit (contig, inversion) tuples in simulator instead of string slicing
- Align test fixtures with test_aim_data.py pattern (AnophelesKaryotypeAnalysis)
- Use correct Af1 GFF params in test fixture
- Delete empty MANIFEST.in
1 parent 7d1690a commit 485ef17

4 files changed

Lines changed: 35 additions & 30 deletions

File tree

MANIFEST.in

Whitespace-only changes.

malariagen_data/anoph/karyotype.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,9 @@ def _require_karyotype_analysis(self):
7070
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
7171
self._require_karyotype_analysis()
7272

73-
filename = self.config.get(
74-
"KARYOTYPE_TAG_SNPS_FILENAME", "karyotype_tag_snps.csv"
75-
)
7673
path = (
7774
f"{self._base_path}/{self._major_version_path}"
78-
f"/snp_karyotype/{self._karyotype_analysis}/{filename}"
75+
f"/snp_karyotype/{self._karyotype_analysis}/karyotype_tag_snps.csv"
7976
)
8077
with self._fs.open(path) as f:
8178
df_tag_snps = pd.read_csv(f, sep=",")

tests/anoph/conftest.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,10 +1154,7 @@ def init_config(self):
11541154
"SITE_ANNOTATIONS_ZARR_PATH": "reference/genome/agamp4/Anopheles-gambiae-PEST_SEQANNOTATION_AgamP4.12.zarr",
11551155
"DEFAULT_AIM_ANALYSIS": "20220528",
11561156
"DEFAULT_SITE_FILTERS_ANALYSIS": "dt_20200416",
1157-
# Simulated placeholder; real value will be set in production config.
1158-
"DEFAULT_KARYOTYPE_ANALYSIS": "20231213",
1159-
"KARYOTYPE_INVERSIONS": ["2La", "2Rb"],
1160-
"KARYOTYPE_TAG_SNPS_FILENAME": "karyotype_tag_snps.csv",
1157+
"DEFAULT_KARYOTYPE_ANALYSIS": "simtest",
11611158
"DEFAULT_COHORTS_ANALYSIS": "20230516",
11621159
"SITE_MASK_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
11631160
"PHASING_ANALYSIS_IDS": ["gamb_colu_arab", "gamb_colu", "arab"],
@@ -1529,13 +1526,12 @@ def init_snp_sites(self):
15291526

15301527
def init_karyotype_tags(self):
15311528
analysis = self.config["DEFAULT_KARYOTYPE_ANALYSIS"]
1532-
inversions = self.config["KARYOTYPE_INVERSIONS"]
1533-
filename = self.config["KARYOTYPE_TAG_SNPS_FILENAME"]
15341529

15351530
# Generate tag SNP data using positions from simulated SNP sites.
1531+
# N.B., inversions are defined here with their contigs explicitly
1532+
# rather than derived via string slicing, for robustness.
15361533
tags = []
1537-
for inversion in inversions:
1538-
contig = inversion[:2]
1534+
for contig, inversion in [("2L", "2La"), ("2R", "2Rb")]:
15391535
snp_pos = self.snp_sites[contig]["variants"]["POS"][:]
15401536
snp_alt = self.snp_sites[contig]["variants"]["ALT"][:]
15411537
n_tags = min(20, len(snp_pos))
@@ -1552,7 +1548,13 @@ def init_karyotype_tags(self):
15521548
)
15531549

15541550
df = pd.DataFrame(tags)
1555-
path = self.bucket_path / "v3" / "snp_karyotype" / analysis / filename
1551+
path = (
1552+
self.bucket_path
1553+
/ "v3"
1554+
/ "snp_karyotype"
1555+
/ analysis
1556+
/ "karyotype_tag_snps.csv"
1557+
)
15561558
path.parent.mkdir(parents=True, exist_ok=True)
15571559
df.to_csv(path, index=False)
15581560

tests/anoph/test_karyotype.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,38 @@
11
import pandas as pd
22
import pytest
33

4-
from malariagen_data import Af1, Ag3
4+
from malariagen_data import ag3 as _ag3
5+
from malariagen_data import af1 as _af1
6+
from malariagen_data.anoph.karyotype import AnophelesKaryotypeAnalysis
57

68

79
@pytest.fixture
8-
def ag3_sim_api(ag3_sim_fixture, tmp_path):
9-
data_path = ag3_sim_fixture.bucket_path.as_posix()
10-
return Ag3(
11-
url=data_path,
12-
public_url=data_path,
10+
def ag3_sim_api(ag3_sim_fixture):
11+
return AnophelesKaryotypeAnalysis(
12+
url=ag3_sim_fixture.url,
13+
public_url=ag3_sim_fixture.url,
14+
config_path=_ag3.CONFIG_PATH,
15+
major_version_number=_ag3.MAJOR_VERSION_NUMBER,
16+
major_version_path=_ag3.MAJOR_VERSION_PATH,
1317
pre=True,
14-
check_location=False,
15-
bokeh_output_notebook=False,
16-
results_cache=tmp_path.as_posix(),
18+
gff_gene_type="gene",
19+
gff_gene_name_attribute="Name",
20+
gff_default_attributes=("ID", "Parent", "Name", "description"),
1721
)
1822

1923

2024
@pytest.fixture
21-
def af1_sim_api(af1_sim_fixture, tmp_path):
22-
data_path = af1_sim_fixture.bucket_path.as_posix()
23-
return Af1(
24-
url=data_path,
25-
public_url=data_path,
25+
def af1_sim_api(af1_sim_fixture):
26+
return AnophelesKaryotypeAnalysis(
27+
url=af1_sim_fixture.url,
28+
public_url=af1_sim_fixture.url,
29+
config_path=_af1.CONFIG_PATH,
30+
major_version_number=_af1.MAJOR_VERSION_NUMBER,
31+
major_version_path=_af1.MAJOR_VERSION_PATH,
2632
pre=True,
27-
check_location=False,
28-
bokeh_output_notebook=False,
29-
results_cache=tmp_path.as_posix(),
33+
gff_gene_type="protein_coding_gene",
34+
gff_gene_name_attribute="Note",
35+
gff_default_attributes=("ID", "Parent", "Note", "description"),
3036
)
3137

3238

0 commit comments

Comments
 (0)