|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +import zarr |
| 6 | + |
| 7 | +from .conftest import ( |
| 8 | + Gff3Simulator, |
| 9 | + simulate_cnv_discordant_read_calls, |
| 10 | + simulate_cnv_hmm, |
| 11 | +) |
| 12 | + |
| 13 | + |
def _write_sample_metadata(path: Path, n_samples: int = 100) -> None:
    """Write a single-column sample metadata CSV to *path*.

    The file has one ``sample_id`` column holding ``n_samples`` identifiers
    of the form ``S0000``, ``S0001``, ... and no index column.
    """
    sample_ids = [f"S{i:04d}" for i in range(n_samples)]
    metadata = pd.DataFrame({"sample_id": sample_ids})
    metadata.to_csv(path, index=False)
| 17 | + |
| 18 | + |
def test_simulate_cnv_hmm_limits_high_variance_fraction(tmp_path):
    """Simulated CNV HMM data must flag under 30% of samples as high variance."""
    samples_csv = tmp_path / "samples.csv"
    store_path = tmp_path / "cnv_hmm.zarr"
    _write_sample_metadata(samples_csv)

    # A fixed seed keeps the simulated output reproducible between runs.
    simulate_cnv_hmm(
        zarr_path=store_path,
        metadata_path=samples_csv,
        contigs=("2L",),
        contig_sizes={"2L": 10_000},
        rng=np.random.default_rng(0),
    )

    # Read the per-sample flags back from the store the simulator wrote.
    flags = zarr.open(store_path, mode="r")["sample_is_high_variance"][:]
    assert np.mean(flags) < 0.3
| 35 | + |
| 36 | + |
def test_simulate_cnv_discordant_read_calls_limits_high_variance_fraction(tmp_path):
    """Simulated discordant-read CNV calls must flag under 30% of samples as high variance."""
    samples_csv = tmp_path / "samples.csv"
    store_path = tmp_path / "cnv_discordant.zarr"
    _write_sample_metadata(samples_csv)

    # A fixed seed keeps the simulated output reproducible between runs.
    simulate_cnv_discordant_read_calls(
        zarr_path=store_path,
        metadata_path=samples_csv,
        contigs=("2L",),
        contig_sizes={"2L": 10_000},
        rng=np.random.default_rng(0),
    )

    # Read the per-sample flags back from the store the simulator wrote.
    flags = zarr.open(store_path, mode="r")["sample_is_high_variance"][:]
    assert np.mean(flags) < 0.3
| 53 | + |
| 54 | + |
def test_simulate_exons_on_minus_strand_reverses_feature_order():
    """UTR/CDS rows for a minus-strand transcript must come out in descending start order.

    The simulator is configured with identical low/high bounds for exon
    count, intron size and exon size, and a seeded generator, so the
    requested layout is fixed.
    """
    sim = Gff3Simulator(
        contig_sizes={"2L": 10_000},
        rng=np.random.default_rng(0),
        n_exons_low=3,
        n_exons_high=3,
        intron_size_low=10,
        intron_size_high=10,
        exon_size_low=100,
        exon_size_high=100,
    )
    rows = list(
        sim.simulate_exons(
            contig="2L",
            strand="-",
            gene_ix=0,
            transcript_ix=0,
            transcript_id="transcript-2L-0-0",
            transcript_start=1,
            transcript_end=1_000,
        )
    )

    # NOTE(review): row[2] / row[3] are presumably the GFF3 "type" and
    # "start" columns - confirm against Gff3Simulator's row layout.
    child_types = {sim.utr5_type, sim.utr3_type, sim.cds_type}
    starts = [row[3] for row in rows if row[2] in child_types]

    # With zero or one matching feature the ordering assertion below would
    # pass vacuously, so require enough features for it to mean something.
    assert len(starts) > 1, "expected at least two UTR/CDS features to compare"
    assert starts == sorted(starts, reverse=True)
0 commit comments