@@ -73,7 +73,7 @@ def haplotypes_frequencies(
7373 )
7474
7575 # Access haplotypes.
76- ds_hap = self .haplotypes (
76+ ds_haps = self .haplotypes (
7777 region = region ,
7878 sample_sets = sample_sets ,
7979 sample_query = sample_query ,
@@ -82,35 +82,37 @@ def haplotypes_frequencies(
8282 )
8383
8484 # Early check for no SNPs.
85- if ds_hap .sizes ["variants" ] == 0 : # pragma: no cover
85+ if ds_haps .sizes ["variants" ] == 0 : # pragma: no cover
8686 raise ValueError ("No SNPs available for the given region." )
8787
8888 # Access genotypes.
89- gt = allel .GenotypeDaskArray (ds_hap ["call_genotype" ].data )
89+ gt = allel .GenotypeDaskArray (ds_haps ["call_genotype" ].data )
9090 with self ._dask_progress (desc = "Compute haplotypes" ):
9191 gt = gt .compute ()
9292
93+ # List all haplotypes
94+ gt_hap = gt .to_haplotypes ()
95+ f_all , _ , _ = haplotype_frequencies (gt_hap )
96+
9397 # Count haplotypes.
9498 freq_cols = dict ()
9599 cohorts_iterator = self ._progress (
96100 coh_dict .items (), desc = "Compute allele frequencies"
97101 )
98- hap_track : dict [np .int64 , float ] = {}
99102 for coh , loc_coh in cohorts_iterator :
100- hap_track = {k : 0 for k in hap_track .keys ()}
103+ # We reset all frequencies to 0 for each cohort
104+ hap_dict = {k : 0 for k in f_all .keys ()}
105+
101106 n_samples = np .count_nonzero (loc_coh )
102107 assert n_samples >= min_cohort_size
103108 gt_coh = allel .GenotypeDaskArray (da .compress (loc_coh , gt , axis = 1 ))
104109 gt_hap = gt_coh .to_haplotypes ().compute ()
105110 f , _ , _ = haplotype_frequencies (gt_hap )
106- hap_track .update (f )
107- freq_cols ["frq_" + coh ] = list (hap_track .values ())
111+ # The frequencies of the observed haplotypes are then updated
112+ hap_dict .update (f )
113+ freq_cols ["frq_" + coh ] = list (hap_dict .values ())
108114
109- n_haps = np .max ([len (i ) for i in freq_cols .values ()])
110- freq_cols = {
111- k : v + [0 for i in range (0 , n_haps - len (v ))] for k , v in freq_cols .items ()
112- }
113- df_freqs = pd .DataFrame (freq_cols , index = hap_track .keys ())
115+ df_freqs = pd .DataFrame (freq_cols , index = hap_dict .keys ())
114116
115117 # Compute max_af.
116118 df_max_af = pd .DataFrame ({"max_af" : df_freqs .max (axis = 1 )})
@@ -196,10 +198,14 @@ def haplotypes_frequencies_advanced(
196198 raise ValueError ("No SNPs available for the given region." )
197199
198200 # Access genotypes.
199- gt = ds_haps ["call_genotype" ].data
201+ gt = allel . GenotypeDaskArray ( ds_haps ["call_genotype" ].data )
200202 with self ._dask_progress (desc = "Compute haplotypes" ):
201203 gt = gt .compute ()
202204
205+ # List all haplotypes
206+ gt_hap = gt .to_haplotypes ()
207+ f_all , _ , _ = haplotype_frequencies (gt_hap )
208+
203209 # Count haplotypes.
204210 hap_freq : dict [np .int64 , float ] = dict ()
205211 hap_count : dict [np .int64 , int ] = dict ()
@@ -213,40 +219,32 @@ def haplotypes_frequencies_advanced(
213219 for cohort in cohorts_iterator :
214220 cohort_key = cohort .taxon , cohort .area , cohort .period
215221 cohort_key_str = cohort .taxon + "_" + cohort .area + "_" + str (cohort .period )
216- hap_freq = {k : 0 for k in hap_freq .keys ()}
217- hap_count = {k : 0 for k in hap_count .keys ()}
218- hap_nob = {k : 0 for k in hap_nob .keys ()}
222+ # We reset all frequencies, counts to 0 for each cohort, nobs is set to the number of haplotypes
219223 n_samples = cohort .size
224+ hap_freq = {k : 0 for k in f_all .keys ()}
225+ hap_count = {k : 0 for k in f_all .keys ()}
226+ hap_nob = {k : 2 * n_samples for k in f_all .keys ()}
220227 assert n_samples >= min_cohort_size
221228 sample_indices = group_samples_by_cohort .indices [cohort_key ]
222229 loc_coh = [i in sample_indices for i in range (0 , gt .shape [1 ])]
223230 gt_coh = allel .GenotypeDaskArray (da .compress (loc_coh , gt , axis = 1 ))
224231 gt_hap = gt_coh .to_haplotypes ().compute ()
225232 f , c , o = haplotype_frequencies (gt_hap )
233+ # The frequencies and counts of the observed haplotypes are then updated, so are the nobs but the values should actually stay the same
226234 hap_freq .update (f )
227235 hap_count .update (c )
228236 hap_nob .update (o )
229237 count_cols ["count_" + cohort_key_str ] = list (hap_count .values ())
230238 freq_cols ["frq_" + cohort_key_str ] = list (hap_freq .values ())
231239 nobs_cols ["nobs_" + cohort_key_str ] = list (hap_nob .values ())
232240
233- n_haps = np .max ([len (i ) for i in freq_cols .values ()])
234- freq_cols = {
235- k : v + [0 for i in range (0 , n_haps - len (v ))] for k , v in freq_cols .items ()
236- }
237241 df_freqs = pd .DataFrame (freq_cols , index = hap_freq .keys ())
238242
239243 # Compute max_af.
240244 df_max_af = pd .DataFrame ({"max_af" : df_freqs .max (axis = 1 )})
241245
242- count_cols = {
243- k : v + [0 for i in range (0 , n_haps - len (v ))] for k , v in count_cols .items ()
244- }
245246 df_counts = pd .DataFrame (count_cols , index = hap_count .keys ())
246247
247- nobs_cols = {
248- k : v + [0 for i in range (0 , n_haps - len (v ))] for k , v in nobs_cols .items ()
249- }
250248 df_nobs = pd .DataFrame (nobs_cols , index = hap_nob .keys ())
251249
252250 # Build the final dataframe.
0 commit comments