Skip to content

Commit 8276c57

Browse files
Merge branch 'master' into GH1151-fix-roh-hmm-cache-name
2 parents 2039bbf + de088d7 commit 8276c57

3 files changed

Lines changed: 155 additions & 7 deletions

File tree

malariagen_data/anoph/frq_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,14 @@ def plot_frequencies_heatmap(
417417
`aa_allele_frequencies_advanced()` or
418418
`gene_cnv_frequencies_advanced()`.
419419
""",
420+
taxa="""
421+
Taxon or list of taxa to include in the plot. If None,
422+
all taxa are shown.
423+
""",
424+
areas="""
425+
Area or list of areas to include in the plot. If None,
426+
all areas are shown.
427+
""",
420428
kwargs="Passed through to `px.line()`.",
421429
),
422430
returns="""

malariagen_data/veff.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import collections
2+
import functools
23
import operator
34

45
import pandas as pd
@@ -31,7 +32,7 @@
3132

3233

3334
class Annotator(object):
34-
def __init__(self, genome, genome_features):
35+
def __init__(self, genome, genome_features, genome_cache_maxsize=5):
3536
"""
3637
An annotator.
3738
@@ -41,14 +42,26 @@ def __init__(self, genome, genome_features):
4142
Reference genome.
4243
genome_features : pandas dataframe
4344
Dataframe with genome annotations.
45+
genome_cache_maxsize : int or None, optional
46+
Maximum number of contig genome sequences to keep in the
47+
LRU cache. Set to ``None`` for an unbounded cache (the
48+
previous default behaviour). Default is 5.
4449
4550
"""
4651

4752
# store initialisation parameters
4853
self._genome = genome
49-
self._genome_cache = dict()
5054
self._genome_features_cache = None
5155

56+
# Create a per-instance LRU cache for genome sequences.
57+
# Defining the cached function inside __init__ ensures each
58+
# Annotator instance has its own independent cache.
59+
@functools.lru_cache(maxsize=genome_cache_maxsize)
60+
def _load_genome_seq(chrom):
61+
return self._genome[chrom][:]
62+
63+
self._load_genome_seq = _load_genome_seq
64+
5265
genome_features = genome_features[
5366
(genome_features.end - genome_features.start) > 0
5467
]
@@ -76,15 +89,15 @@ def get_children(self, feature_id):
7689

7790
def get_ref_seq(self, chrom, start, stop):
7891
"""Return the reference sequence for 1-based, inclusive coords."""
79-
try:
80-
seq = self._genome_cache[chrom]
81-
except KeyError:
82-
seq = self._genome[chrom][:]
83-
self._genome_cache[chrom] = seq
92+
seq = self._load_genome_seq(chrom)
8493
ref_seq = seq[start - 1 : stop]
8594
ref_seq = ref_seq.tobytes().decode()
8695
return ref_seq
8796

97+
def clear_genome_cache(self):
98+
"""Clear all cached genome sequences to free memory."""
99+
self._load_genome_seq.cache_clear()
100+
88101
def get_ref_allele_coords(self, chrom, pos, ref):
89102
# N.B., use one-based inclusive coordinate system (like GFF3) throughout
90103
ref_start = pos

tests/test_veff.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""Tests for the Annotator genome cache in veff.py."""
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
from malariagen_data.veff import Annotator
7+
8+
9+
def _make_genome(contigs):
10+
"""Create a minimal mock genome (dict-like) mapping contig names to
11+
numpy byte arrays."""
12+
genome = {}
13+
for name in contigs:
14+
seq = np.frombuffer(f"ATCGATCG{name}".encode(), dtype="S1")
15+
genome[name] = seq
16+
return genome
17+
18+
19+
def _make_genome_features():
20+
"""Return a minimal genome_features DataFrame with the required columns."""
21+
return pd.DataFrame(
22+
{
23+
"ID": ["gene1"],
24+
"Parent": ["root"],
25+
"type": ["gene"],
26+
"start": [1],
27+
"end": [100],
28+
"contig": ["chr1"],
29+
"strand": ["+"],
30+
}
31+
)
32+
33+
34+
class TestGenomeCacheDefaultMaxsize:
35+
"""Verify that the default cache maxsize is 5."""
36+
37+
def test_default_maxsize(self):
38+
genome = _make_genome(["chr1"])
39+
ann = Annotator(genome=genome, genome_features=_make_genome_features())
40+
cache_info = ann._load_genome_seq.cache_info()
41+
assert cache_info.maxsize == 5
42+
43+
44+
class TestGenomeCacheLRUEviction:
45+
"""Verify that the LRU cache evicts the least recently used entry when full."""
46+
47+
def test_eviction(self):
48+
contigs = ["chr1", "chr2", "chr3"]
49+
genome = _make_genome(contigs)
50+
ann = Annotator(
51+
genome=genome,
52+
genome_features=_make_genome_features(),
53+
genome_cache_maxsize=2,
54+
)
55+
56+
# Load all three contigs in order.
57+
for c in contigs:
58+
ann._load_genome_seq(c)
59+
60+
info = ann._load_genome_seq.cache_info()
61+
# Only 2 entries should remain (chr2 and chr3).
62+
assert info.currsize == 2
63+
# 3 total misses (each first access is a miss).
64+
assert info.misses == 3
65+
66+
# Accessing chr1 again should be a miss because it was evicted.
67+
ann._load_genome_seq("chr1")
68+
info = ann._load_genome_seq.cache_info()
69+
assert info.misses == 4
70+
71+
72+
class TestClearGenomeCache:
73+
"""Verify that clear_genome_cache() empties the cache."""
74+
75+
def test_clear(self):
76+
genome = _make_genome(["chr1", "chr2"])
77+
ann = Annotator(genome=genome, genome_features=_make_genome_features())
78+
79+
ann._load_genome_seq("chr1")
80+
ann._load_genome_seq("chr2")
81+
assert ann._load_genome_seq.cache_info().currsize == 2
82+
83+
ann.clear_genome_cache()
84+
assert ann._load_genome_seq.cache_info().currsize == 0
85+
86+
87+
class TestGenomeCacheUnbounded:
88+
"""Verify that maxsize=None gives an unbounded cache."""
89+
90+
def test_unbounded(self):
91+
contigs = [f"chr{i}" for i in range(20)]
92+
genome = _make_genome(contigs)
93+
ann = Annotator(
94+
genome=genome,
95+
genome_features=_make_genome_features(),
96+
genome_cache_maxsize=None,
97+
)
98+
99+
for c in contigs:
100+
ann._load_genome_seq(c)
101+
102+
info = ann._load_genome_seq.cache_info()
103+
assert info.maxsize is None
104+
assert info.currsize == 20
105+
106+
107+
class TestPerInstanceCacheIsolation:
108+
"""Verify that two Annotator instances have independent caches."""
109+
110+
def test_isolation(self):
111+
genome = _make_genome(["chr1", "chr2"])
112+
features = _make_genome_features()
113+
114+
ann1 = Annotator(genome=genome, genome_features=features)
115+
ann2 = Annotator(genome=genome, genome_features=features)
116+
117+
ann1._load_genome_seq("chr1")
118+
assert ann1._load_genome_seq.cache_info().currsize == 1
119+
assert ann2._load_genome_seq.cache_info().currsize == 0
120+
121+
ann2._load_genome_seq("chr2")
122+
assert ann1._load_genome_seq.cache_info().currsize == 1
123+
assert ann2._load_genome_seq.cache_info().currsize == 1
124+
125+
ann1.clear_genome_cache()
126+
assert ann1._load_genome_seq.cache_info().currsize == 0
127+
assert ann2._load_genome_seq.cache_info().currsize == 1

0 commit comments

Comments
 (0)