Skip to content

Commit 278943d

Browse files
authored
Merge branch 'master' into feature/amino-acid-distance-metrics
2 parents 44aac78 + d078091 commit 278943d

13 files changed

Lines changed: 131 additions & 34 deletions

File tree

malariagen_data/ag3.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,14 @@ def _setup_aim_palettes():
7474
"gcx4": TAXON_PALETTE[10],
7575
"unassigned": "black",
7676
}
77+
# Colors for aim_species column, matching the AIM palettes.
78+
AIM_SPECIES_COLORS = {
79+
"gambiae": AIM_PALETTES["gamb_vs_colu"][1],
80+
"coluzzii": AIM_PALETTES["gamb_vs_colu"][3],
81+
"arabiensis": AIM_PALETTES["gambcolu_vs_arab"][3],
82+
"gambcolu": AIM_PALETTES["gambcolu_vs_arab"][1],
83+
"unassigned": "black",
84+
}
7785

7886
# Note: These column names will be treated as case-insensitive,
7987
# because these column names and the column names from the CSV
@@ -197,6 +205,7 @@ def __init__(
197205
storage_options=storage_options,
198206
tqdm_class=tqdm_class,
199207
taxon_colors=TAXON_COLORS,
208+
aim_species_colors=AIM_SPECIES_COLORS,
200209
virtual_contigs=VIRTUAL_CONTIGS,
201210
gene_names=GENE_NAMES,
202211
inversion_tag_path=INVERSION_TAG_PATH,

malariagen_data/anoph/aim_data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ def plot_aim_heatmap(
208208
show: plotly_params.show = True,
209209
renderer: plotly_params.renderer = None,
210210
) -> plotly_params.figure:
211+
aims = self._prep_aims_param(aims=aims)
211212
# Load AIM calls.
212213
ds = self.aim_calls(
213214
aims=aims,

malariagen_data/anoph/cnv_data.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -642,9 +642,13 @@ def cnv_discordant_read_calls(
642642

643643
ly.append(y)
644644

645-
if len(ly) == 0:
646-
# Bail out, no data for given sample sets and analysis.
647-
raise ValueError("No data found for requested sample sets.")
645+
if len(ly) == 0:
646+
# Bail out, no data for given sample sets and contig.
647+
raise ValueError(
648+
f"No CNV discordant read calls data found for contig {c!r} "
649+
f"in the requested sample sets. This could be because the "
650+
f"sample sets do not have discordant read calls data available."
651+
)
648652

649653
x = _simple_xarray_concat(ly, dim=DIM_SAMPLE)
650654
lx.append(x)

malariagen_data/anoph/fst.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -539,11 +539,10 @@ def plot_pairwise_average_fst(
539539
if annotation == "standard error":
540540
fig_df.loc[cohort1, cohort2] = se
541541
elif annotation == "Z score":
542-
try:
543-
zs = fst / se
544-
fig_df.loc[cohort1, cohort2] = zs
545-
except ZeroDivisionError:
542+
if se == 0:
546543
fig_df.loc[cohort1, cohort2] = np.nan
544+
else:
545+
fig_df.loc[cohort1, cohort2] = fst / se
547546
else:
548547
fig_df.loc[cohort1, cohort2] = fst
549548

malariagen_data/anoph/g123.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def g123_gwss(
170170
) -> Tuple[np.ndarray, np.ndarray]:
171171
# Change this name if you ever change the behaviour of this function, to
172172
# invalidate any previously cached data.
173-
name = "g123_gwss_v1"
173+
name = "g123_gwss_v2"
174174

175175
valid_sites = self.phasing_analysis_ids + ("all", "segregating")
176176
if sites not in valid_sites:
@@ -181,7 +181,7 @@ def g123_gwss(
181181
params = dict(
182182
contig=contig,
183183
sites=sites,
184-
site_mask=site_mask,
184+
site_mask=self._prep_optional_site_mask_param(site_mask=site_mask),
185185
window_size=window_size,
186186
sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
187187
# N.B., do not be tempted to convert this sample query into integer

malariagen_data/anoph/sample_metadata.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(
3333
aim_analysis: Optional[str] = None,
3434
aim_metadata_dtype: Optional[Mapping[str, Any]] = None,
3535
taxon_colors: Optional[Mapping[str, str]] = None,
36+
aim_species_colors: Optional[Mapping[str, str]] = None,
3637
**kwargs,
3738
):
3839
# N.B., this class is designed to work cooperatively, and
@@ -73,6 +74,8 @@ def __init__(
7374
# Set up taxon colors.
7475
self._taxon_colors = taxon_colors
7576

77+
self._aim_species_colors = aim_species_colors
78+
7679
# Set up extra metadata.
7780
self._extra_metadata: List = []
7881

@@ -1304,6 +1307,11 @@ def _setup_sample_colors_plotly(
13041307
# Special case, default taxon colors and order.
13051308
color_discrete_map = self._taxon_colors
13061309

1310+
# Special handling for aim_species colors.
1311+
if color == "aim_species" and color_discrete_map is None:
1312+
# Special case, default aim_species colors and order.
1313+
color_discrete_map = self._aim_species_colors
1314+
13071315
if isinstance(color, str):
13081316
if "cohort_" + color in data.columns:
13091317
# Convenience to allow things like "admin1_year" instead of "cohort_admin1_year".

malariagen_data/anoph/snp_data.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from functools import lru_cache
23
from typing import Any, Dict, List, Optional, Tuple, Union
34

@@ -38,6 +39,11 @@
3839
from .genome_sequence import AnophelesGenomeSequenceData
3940
from .sample_metadata import AnophelesSampleMetadata
4041

42+
# Maximum number of entries kept in the per-instance _cache_locate_site_class
43+
# dict. The natural ceiling is n_contigs × n_site_classes (≈ 45 for Ag3), so
44+
# 64 gives comfortable headroom without allowing unbounded growth.
45+
_LOCATE_SITE_CLASS_CACHE_MAXSIZE = 64
46+
4147

4248
class AnophelesSnpData(
4349
AnophelesSampleMetadata, AnophelesGenomeFeaturesData, AnophelesGenomeSequenceData
@@ -68,6 +74,14 @@ def __init__(
6874
self._cache_site_annotations = None
6975
self._cache_locate_site_class: Dict = dict()
7076

77+
# Create the SNP-calls cache as a per-instance lru_cache wrapping the
78+
# bound method. Storing it on the instance (rather than using a
79+
# class-level @lru_cache decorator) means:
80+
# 1. `self` is not part of the cache key, so no class-level cache pins old
81+
# instances; the instance/bound-method cycle is reclaimed by the garbage collector once the caller drops their reference.
82+
# 2. Different instances have independent, non-interfering caches.
83+
self._cached_snp_calls = lru_cache(maxsize=2)(self._raw_snp_calls)
84+
7185
@property
7286
def _site_filters_analysis(self) -> Optional[str]:
7387
if self._site_filters_analysis_override:
@@ -928,6 +942,13 @@ def _locate_site_class(
928942

929943
self._cache_locate_site_class[cache_key] = loc_ann
930944

945+
# Evict the oldest entry when the cache exceeds its size limit.
946+
# Plain dicts preserve insertion order (Python 3.7+), so the first
947+
# key is always the oldest.
948+
while len(self._cache_locate_site_class) > _LOCATE_SITE_CLASS_CACHE_MAXSIZE:
949+
oldest = next(iter(self._cache_locate_site_class))
950+
del self._cache_locate_site_class[oldest]
951+
931952
return loc_ann
932953

933954
def _snp_calls_for_contig(
@@ -1088,16 +1109,7 @@ def snp_calls(
10881109
chunks=chunks,
10891110
)
10901111

1091-
# Here we cache to improve performance for functions which
1092-
# access SNP calls more than once. For example, this currently
1093-
# happens during access of biallelic SNP calls, because a
1094-
# first computation of allele counts is required, before
1095-
# then using that to filter SNP calls.
1096-
#
1097-
# We only cache up to 2 items because otherwise we can see
1098-
# high memory usage.
1099-
@lru_cache(maxsize=2)
1100-
def _cached_snp_calls(
1112+
def _raw_snp_calls(
11011113
self,
11021114
*,
11031115
regions: Tuple[Region, ...],
@@ -1253,6 +1265,12 @@ def _snp_calls(
12531265
if max_cohort_size is not None:
12541266
n_samples = ds.sizes["samples"]
12551267
if n_samples > max_cohort_size:
1268+
warnings.warn(
1269+
f"Cohort downsampled from {n_samples} to {max_cohort_size} "
1270+
"samples. Set max_cohort_size=None to disable downsampling.",
1271+
UserWarning,
1272+
stacklevel=2,
1273+
)
12561274
rng = np.random.default_rng(seed=random_seed)
12571275
loc_downsample = rng.choice(
12581276
n_samples, size=max_cohort_size, replace=False

malariagen_data/anopheles.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,13 @@ def __init__(
133133
gff_default_attributes: Tuple[str, ...],
134134
tqdm_class,
135135
storage_options: Mapping,
136-
taxon_colors: Optional[Mapping[str, str]],
137-
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
138-
gene_names: Optional[Mapping[str, str]],
139-
inversion_tag_path: Optional[str],
140-
unrestricted_use_only: Optional[bool],
141-
surveillance_use_only: Optional[bool],
136+
taxon_colors: Optional[Mapping[str, str]] = None,
137+
aim_species_colors: Optional[Mapping[str, str]] = None,
138+
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
139+
gene_names: Optional[Mapping[str, str]] = None,
140+
inversion_tag_path: Optional[str] = None,
141+
unrestricted_use_only: Optional[bool] = None,
142+
surveillance_use_only: Optional[bool] = None,
142143
):
143144
super().__init__(
144145
url=url,
@@ -171,6 +172,7 @@ def __init__(
171172
results_cache=results_cache,
172173
tqdm_class=tqdm_class,
173174
taxon_colors=taxon_colors,
175+
aim_species_colors=aim_species_colors,
174176
virtual_contigs=virtual_contigs,
175177
gene_names=gene_names,
176178
inversion_tag_path=inversion_tag_path,

malariagen_data/mjn.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -281,12 +281,6 @@ def _mjn_graph_edges(
281281
# add edge from final intermediate node to node j
282282
source = f"anon_{i}_{j}_{sep-2}"
283283
target = j
284-
graph_node = {
285-
"id": source,
286-
"count": 0,
287-
"width": anon_width,
288-
}
289-
graph_nodes.append(graph_node)
290284
graph_edge = {
291285
"id": f"edge_{i}_{j}_{sep-1}",
292286
"source": source,

tests/anoph/test_cnv_data.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,10 @@ def test_cnv_discordant_read_calls(fixture, api: AnophelesCnvData):
626626
assert isinstance(d2, xr.DataArray)
627627

628628
# Check with a contig that should not exist
629-
with pytest.raises(ValueError):
629+
with pytest.raises(
630+
ValueError,
631+
match="No CNV discordant read calls data found|no CNVs available for contig",
632+
):
630633
api.cnv_discordant_read_calls(
631634
contig="foobar", sample_sets=random.choice(all_sample_sets)
632635
)

0 commit comments

Comments
 (0)