Skip to content

Commit 233ada5

Browse files
authored
Merge branch 'master' into GH407_allow_gene_labels
2 parents 46e0cfe + 33b8535 commit 233ada5

10 files changed

Lines changed: 175 additions & 52 deletions

File tree

malariagen_data/ag3.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,7 @@ def karyotype(
378378
inversion: inversion_param,
379379
sample_sets: Optional[base_params.sample_sets] = None,
380380
sample_query: Optional[base_params.sample_query] = None,
381+
sample_query_options: Optional[base_params.sample_query_options] = None,
381382
) -> pd.DataFrame:
382383
# load tag snp data
383384
df_tagsnps = self.load_inversion_tags(inversion=inversion)
@@ -390,7 +391,10 @@ def karyotype(
390391
region = f"{contig}:{start}-{end}"
391392

392393
ds_snps = self.snp_calls(
393-
region=region, sample_sets=sample_sets, sample_query=sample_query
394+
region=region,
395+
sample_sets=sample_sets,
396+
sample_query=sample_query,
397+
sample_query_options=sample_query_options,
394398
)
395399

396400
with self._spinner("Inferring karyotype from tag SNPs"):

malariagen_data/anoph/fst.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,13 @@ def __init__(
2828

2929
def _fst_gwss(
3030
self,
31+
*,
3132
contig,
3233
window_size,
3334
sample_sets,
3435
cohort1_query,
3536
cohort2_query,
37+
sample_query_options,
3638
site_mask,
3739
cohort_size,
3840
min_cohort_size,
@@ -46,6 +48,7 @@ def _fst_gwss(
4648
ac1 = self.snp_allele_counts(
4749
region=contig,
4850
sample_query=cohort1_query,
51+
sample_query_options=sample_query_options,
4952
sample_sets=sample_sets,
5053
site_mask=site_mask,
5154
cohort_size=cohort_size,
@@ -58,6 +61,7 @@ def _fst_gwss(
5861
ac2 = self.snp_allele_counts(
5962
region=contig,
6063
sample_query=cohort2_query,
64+
sample_query_options=sample_query_options,
6165
sample_sets=sample_sets,
6266
site_mask=site_mask,
6367
cohort_size=cohort_size,
@@ -78,10 +82,11 @@ def _fst_gwss(
7882
).compute()
7983

8084
with self._spinner(desc="Compute Fst"):
81-
fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
82-
# Sometimes Fst can be very slightly below zero, clip for simplicity.
83-
fst = np.clip(fst, a_min=clip_min, a_max=1)
84-
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
85+
with np.errstate(divide="ignore", invalid="ignore"):
86+
fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
87+
# Sometimes Fst can be very slightly below zero, clip for simplicity.
88+
fst = np.clip(fst, a_min=clip_min, a_max=1)
89+
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
8590

8691
results = dict(x=x, fst=fst)
8792

@@ -104,6 +109,7 @@ def fst_gwss(
104109
window_size: fst_params.window_size,
105110
cohort1_query: base_params.sample_query,
106111
cohort2_query: base_params.sample_query,
112+
sample_query_options: Optional[base_params.sample_query_options] = None,
107113
sample_sets: Optional[base_params.sample_sets] = None,
108114
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
109115
cohort_size: Optional[base_params.cohort_size] = fst_params.cohort_size_default,
@@ -127,6 +133,7 @@ def fst_gwss(
127133
window_size=window_size,
128134
cohort1_query=cohort1_query,
129135
cohort2_query=cohort2_query,
136+
sample_query_options=sample_query_options,
130137
sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
131138
site_mask=self._prep_optional_site_mask_param(site_mask=site_mask),
132139
cohort_size=cohort_size,
@@ -161,6 +168,7 @@ def plot_fst_gwss_track(
161168
window_size: fst_params.window_size,
162169
cohort1_query: base_params.sample_query,
163170
cohort2_query: base_params.sample_query,
171+
sample_query_options: Optional[base_params.sample_query_options] = None,
164172
sample_sets: Optional[base_params.sample_sets] = None,
165173
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
166174
cohort_size: Optional[base_params.cohort_size] = fst_params.cohort_size_default,
@@ -189,6 +197,7 @@ def plot_fst_gwss_track(
189197
max_cohort_size=max_cohort_size,
190198
cohort1_query=cohort1_query,
191199
cohort2_query=cohort2_query,
200+
sample_query_options=sample_query_options,
192201
sample_sets=sample_sets,
193202
site_mask=site_mask,
194203
random_seed=random_seed,
@@ -265,6 +274,7 @@ def plot_fst_gwss(
265274
window_size: fst_params.window_size,
266275
cohort1_query: base_params.sample_query,
267276
cohort2_query: base_params.sample_query,
277+
sample_query_options: Optional[base_params.sample_query_options] = None,
268278
sample_sets: Optional[base_params.sample_sets] = None,
269279
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
270280
cohort_size: Optional[base_params.cohort_size] = fst_params.cohort_size_default,
@@ -290,6 +300,7 @@ def plot_fst_gwss(
290300
window_size=window_size,
291301
cohort1_query=cohort1_query,
292302
cohort2_query=cohort2_query,
303+
sample_query_options=sample_query_options,
293304
sample_sets=sample_sets,
294305
site_mask=site_mask,
295306
cohort_size=cohort_size,
@@ -348,6 +359,7 @@ def average_fst(
348359
region: base_params.region,
349360
cohort1_query: base_params.sample_query,
350361
cohort2_query: base_params.sample_query,
362+
sample_query_options: Optional[base_params.sample_query] = None,
351363
sample_sets: Optional[base_params.sample_sets] = None,
352364
cohort_size: Optional[base_params.cohort_size] = fst_params.cohort_size_default,
353365
min_cohort_size: Optional[
@@ -366,6 +378,7 @@ def average_fst(
366378
region=region,
367379
sample_sets=sample_sets,
368380
sample_query=cohort1_query,
381+
sample_query_options=sample_query_options,
369382
cohort_size=cohort_size,
370383
site_mask=site_mask,
371384
site_class=site_class,
@@ -377,6 +390,7 @@ def average_fst(
377390
region=region,
378391
sample_sets=sample_sets,
379392
sample_query=cohort2_query,
393+
sample_query_options=sample_query_options,
380394
cohort_size=cohort_size,
381395
site_mask=site_mask,
382396
site_class=site_class,

malariagen_data/anoph/genome_features.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,11 @@ def genome_features(
149149
df_part = df_part.query(f"end >= {r.start}")
150150
parts.append(df_part)
151151
df = pd.concat(parts, axis=0)
152-
return df.reset_index(drop=True).copy()
152+
return df.sort_values(["contig", "start"]).reset_index(drop=True).copy()
153153

154154
return (
155155
self._genome_features(attributes=attributes_normed)
156+
.sort_values(["contig", "start"])
156157
.reset_index(drop=True)
157158
.copy()
158159
)

malariagen_data/anoph/gplt_params.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Parameters for genome plotting functions. N.B., genome plots are always
22
plotted with bokeh."""
33

4-
from typing import Literal, Mapping, Optional, Union, Sequence
4+
from typing import Literal, Mapping, Optional, Union, Final, Sequence
55

66
import bokeh.models
77
from typing_extensions import Annotated, TypeAlias
@@ -112,6 +112,13 @@
112112
"Passed through to bokeh line() function.",
113113
]
114114

115+
contig_colors: TypeAlias = Annotated[
116+
list[str],
117+
"A sequence of colors.",
118+
]
119+
120+
contig_colors_default: Final[contig_colors] = list(bokeh.palettes.d3["Category20b"][5])
121+
115122
colors: TypeAlias = Annotated[Sequence[str], "List of colors."]
116123

117124
gene_labels: TypeAlias = Annotated[

malariagen_data/anoph/h12.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,16 @@ def _h12_gwss(
266266
# Compute window midpoints.
267267
pos = ds_haps["variant_position"].values
268268
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
269+
contigs = np.asarray(
270+
allel.moving_statistic(
271+
ds_haps["variant_contig"].values,
272+
statistic=np.median,
273+
size=window_size,
274+
),
275+
dtype=int,
276+
)
269277

270-
results = dict(x=x, h12=h12)
278+
results = dict(x=x, h12=h12, contigs=contigs)
271279

272280
return results
273281

@@ -277,6 +285,7 @@ def _h12_gwss(
277285
returns=dict(
278286
x="An array containing the window centre point genomic positions.",
279287
h12="An array with h12 statistic values for each window.",
288+
contigs="An array with the contig for each window. The median is chosen for windows overlapping a change of contig.",
280289
),
281290
)
282291
def h12_gwss(
@@ -297,10 +306,10 @@ def h12_gwss(
297306
random_seed: base_params.random_seed = 42,
298307
chunks: base_params.chunks = base_params.native_chunks,
299308
inline_array: base_params.inline_array = base_params.inline_array_default,
300-
) -> Tuple[np.ndarray, np.ndarray]:
309+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
301310
# Change this name if you ever change the behaviour of this function, to
302311
# invalidate any previously cached data.
303-
name = "h12_gwss_v1"
312+
name = "h12_gwss_v2"
304313

305314
params = dict(
306315
contig=contig,
@@ -327,8 +336,9 @@ def h12_gwss(
327336

328337
x = results["x"]
329338
h12 = results["h12"]
339+
contigs = results["contigs"]
330340

331-
return x, h12
341+
return x, h12, contigs
332342

333343
@check_types
334344
@doc(
@@ -354,14 +364,15 @@ def plot_h12_gwss_track(
354364
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
355365
width: gplt_params.width = gplt_params.width_default,
356366
height: gplt_params.height = 200,
367+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
357368
show: gplt_params.show = True,
358369
x_range: Optional[gplt_params.x_range] = None,
359370
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
360371
chunks: base_params.chunks = base_params.native_chunks,
361372
inline_array: base_params.inline_array = base_params.inline_array_default,
362373
) -> gplt_params.figure:
363374
# Compute H12.
364-
x, h12 = self.h12_gwss(
375+
x, h12, contigs = self.h12_gwss(
365376
contig=contig,
366377
analysis=analysis,
367378
window_size=window_size,
@@ -412,15 +423,14 @@ def plot_h12_gwss_track(
412423
)
413424

414425
# Plot H12.
415-
fig.scatter(
416-
x=x,
417-
y=h12,
418-
marker="circle",
419-
size=3,
420-
line_width=1,
421-
line_color="black",
422-
fill_color=None,
423-
)
426+
for s in set(contigs):
427+
idxs = contigs == s
428+
fig.scatter(
429+
x=x[idxs],
430+
y=h12[idxs],
431+
marker="circle",
432+
color=contig_colors[s % len(contig_colors)],
433+
)
424434

425435
# Tidy up the plot.
426436
fig.yaxis.axis_label = "H12"
@@ -457,6 +467,7 @@ def plot_h12_gwss(
457467
sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default,
458468
width: gplt_params.width = gplt_params.width_default,
459469
track_height: gplt_params.track_height = 170,
470+
contig_colors: gplt_params.contig_colors = gplt_params.contig_colors_default,
460471
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
461472
show: gplt_params.show = True,
462473
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
@@ -479,6 +490,7 @@ def plot_h12_gwss(
479490
sizing_mode=sizing_mode,
480491
width=width,
481492
height=track_height,
493+
contig_colors=contig_colors,
482494
show=False,
483495
output_backend=output_backend,
484496
chunks=chunks,
@@ -575,7 +587,7 @@ def plot_h12_gwss_multi_overlay_track(
575587
)
576588

577589
# Determine X axis range.
578-
x, _ = res[list(cohort_queries.keys())[0]]
590+
x, _, _ = res[list(cohort_queries.keys())[0]]
579591
x_min = x[0]
580592
x_max = x[-1]
581593
if x_range is None:
@@ -610,7 +622,7 @@ def plot_h12_gwss_multi_overlay_track(
610622
)
611623

612624
# Plot H12.
613-
for i, (cohort_label, (x, h12)) in enumerate(res.items()):
625+
for i, (cohort_label, (x, h12, contig)) in enumerate(res.items()):
614626
fig.scatter(
615627
x=x,
616628
y=h12,

0 commit comments

Comments
 (0)