Skip to content

Commit 5e636f2

Browse files
committed
Added a first run of all haplotypes to set up the list of observed haplotypes.
1 parent 12ac956 commit 5e636f2

1 file changed

Lines changed: 24 additions & 26 deletions

File tree

malariagen_data/anoph/hap_freq.py

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def haplotypes_frequencies(
7373
)
7474

7575
# Access haplotypes.
76-
ds_hap = self.haplotypes(
76+
ds_haps = self.haplotypes(
7777
region=region,
7878
sample_sets=sample_sets,
7979
sample_query=sample_query,
@@ -82,35 +82,37 @@ def haplotypes_frequencies(
8282
)
8383

8484
# Early check for no SNPs.
85-
if ds_hap.sizes["variants"] == 0: # pragma: no cover
85+
if ds_haps.sizes["variants"] == 0: # pragma: no cover
8686
raise ValueError("No SNPs available for the given region.")
8787

8888
# Access genotypes.
89-
gt = allel.GenotypeDaskArray(ds_hap["call_genotype"].data)
89+
gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
9090
with self._dask_progress(desc="Compute haplotypes"):
9191
gt = gt.compute()
9292

93+
# List all haplotypes
94+
gt_hap = gt.to_haplotypes()
95+
f_all, _, _ = haplotype_frequencies(gt_hap)
96+
9397
# Count haplotypes.
9498
freq_cols = dict()
9599
cohorts_iterator = self._progress(
96100
coh_dict.items(), desc="Compute allele frequencies"
97101
)
98-
hap_track: dict[np.int64, float] = {}
99102
for coh, loc_coh in cohorts_iterator:
100-
hap_track = {k: 0 for k in hap_track.keys()}
103+
# We reset all frequencies to 0 for each cohort
104+
hap_dict = {k: 0 for k in f_all.keys()}
105+
101106
n_samples = np.count_nonzero(loc_coh)
102107
assert n_samples >= min_cohort_size
103108
gt_coh = allel.GenotypeDaskArray(da.compress(loc_coh, gt, axis=1))
104109
gt_hap = gt_coh.to_haplotypes().compute()
105110
f, _, _ = haplotype_frequencies(gt_hap)
106-
hap_track.update(f)
107-
freq_cols["frq_" + coh] = list(hap_track.values())
111+
# The frequencies of the observed haplotypes are then updated
112+
hap_dict.update(f)
113+
freq_cols["frq_" + coh] = list(hap_dict.values())
108114

109-
n_haps = np.max([len(i) for i in freq_cols.values()])
110-
freq_cols = {
111-
k: v + [0 for i in range(0, n_haps - len(v))] for k, v in freq_cols.items()
112-
}
113-
df_freqs = pd.DataFrame(freq_cols, index=hap_track.keys())
115+
df_freqs = pd.DataFrame(freq_cols, index=hap_dict.keys())
114116

115117
# Compute max_af.
116118
df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})
@@ -196,10 +198,14 @@ def haplotypes_frequencies_advanced(
196198
raise ValueError("No SNPs available for the given region.")
197199

198200
# Access genotypes.
199-
gt = ds_haps["call_genotype"].data
201+
gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
200202
with self._dask_progress(desc="Compute haplotypes"):
201203
gt = gt.compute()
202204

205+
# List all haplotypes
206+
gt_hap = gt.to_haplotypes()
207+
f_all, _, _ = haplotype_frequencies(gt_hap)
208+
203209
# Count haplotypes.
204210
hap_freq: dict[np.int64, float] = dict()
205211
hap_count: dict[np.int64, int] = dict()
@@ -213,40 +219,32 @@ def haplotypes_frequencies_advanced(
213219
for cohort in cohorts_iterator:
214220
cohort_key = cohort.taxon, cohort.area, cohort.period
215221
cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)
216-
hap_freq = {k: 0 for k in hap_freq.keys()}
217-
hap_count = {k: 0 for k in hap_count.keys()}
218-
hap_nob = {k: 0 for k in hap_nob.keys()}
222+
            # We reset all frequencies and counts to 0 for each cohort; nobs is set to the number of haplotypes (2 * n_samples)
219223
n_samples = cohort.size
224+
hap_freq = {k: 0 for k in f_all.keys()}
225+
hap_count = {k: 0 for k in f_all.keys()}
226+
hap_nob = {k: 2 * n_samples for k in f_all.keys()}
220227
assert n_samples >= min_cohort_size
221228
sample_indices = group_samples_by_cohort.indices[cohort_key]
222229
loc_coh = [i in sample_indices for i in range(0, gt.shape[1])]
223230
gt_coh = allel.GenotypeDaskArray(da.compress(loc_coh, gt, axis=1))
224231
gt_hap = gt_coh.to_haplotypes().compute()
225232
f, c, o = haplotype_frequencies(gt_hap)
233+
            # The frequencies and counts of the observed haplotypes are then updated; so are the nobs, although those values should actually stay the same
226234
hap_freq.update(f)
227235
hap_count.update(c)
228236
hap_nob.update(o)
229237
count_cols["count_" + cohort_key_str] = list(hap_count.values())
230238
freq_cols["frq_" + cohort_key_str] = list(hap_freq.values())
231239
nobs_cols["nobs_" + cohort_key_str] = list(hap_nob.values())
232240

233-
n_haps = np.max([len(i) for i in freq_cols.values()])
234-
freq_cols = {
235-
k: v + [0 for i in range(0, n_haps - len(v))] for k, v in freq_cols.items()
236-
}
237241
df_freqs = pd.DataFrame(freq_cols, index=hap_freq.keys())
238242

239243
# Compute max_af.
240244
df_max_af = pd.DataFrame({"max_af": df_freqs.max(axis=1)})
241245

242-
count_cols = {
243-
k: v + [0 for i in range(0, n_haps - len(v))] for k, v in count_cols.items()
244-
}
245246
df_counts = pd.DataFrame(count_cols, index=hap_count.keys())
246247

247-
nobs_cols = {
248-
k: v + [0 for i in range(0, n_haps - len(v))] for k, v in nobs_cols.items()
249-
}
250248
df_nobs = pd.DataFrame(nobs_cols, index=hap_nob.keys())
251249

252250
# Build the final dataframe.

0 commit comments

Comments
 (0)