@@ -791,6 +791,17 @@ def _compute_haplotype_sharing(
791791 chunks ,
792792 inline_array ,
793793 ):
794+ """
795+ Computes the number of identical haplotypes shared between cohorts
796+
797+ This process involves:
798+ 1. Loading haplotypes for a genomic region.
799+ 2. Assigning two cohort labels per diploid sample (one for each haplotype).
800+ 3. Filtering down to segregating sites to speed up distinction.
801+ 4. Grouping all identical haplotypes together using `ht.distinct()`.
802+ 5. Counting the number of shared identical groups between every pair of cohorts and returning sharing matrix.
803+ """
804+ # Load phased genotypes for the specified region
794805 ds_haps = self .haplotypes (
795806 region = region ,
796807 analysis = analysis ,
@@ -802,17 +813,19 @@ def _compute_haplotype_sharing(
802813 chunks = chunks ,
803814 inline_array = inline_array ,
804815 )
805-
816+ # Load metadata to map samples to cohorts
806817 df_samples = self .sample_metadata (
807818 sample_sets = sample_sets ,
808819 sample_query = sample_query ,
809820 sample_query_options = sample_query_options ,
810821 )
811-
822+ # Ensure metadata is in exactly the same order as samples in ds_haps
812823 samples_phased = ds_haps ["sample_id" ].values
813824 df_samples_phased = (
814825 df_samples .set_index ("sample_id" ).loc [samples_phased ].reset_index ()
815826 )
827+ # Each diploid sample has 2 haplotypes. Duplicate each metadata row
828+ # so cohort labels align 1:1 with the flattened haplotype array.
816829 df_haps = df_samples_phased .loc [df_samples_phased .index .repeat (2 )].reset_index (
817830 drop = True
818831 )
@@ -822,26 +835,27 @@ def _compute_haplotype_sharing(
822835 f"Column '{ cohort_col } ' not found in sample metadata. "
823836 f"Available columns: { ', ' .join (df_haps .columns )} "
824837 )
825-
838+ # Convert paired genotypes to a flat array: (n_variants, 2 * n_samples)
826839 gt = allel .GenotypeDaskArray (ds_haps ["call_genotype" ].data )
827840 with self ._dask_progress (desc = "Load haplotypes" ):
828841 ht = gt .to_haplotypes ().compute ()
829-
842+ # Remove non-segregating sites (where all haplotypes are identical)
830843 ac = ht .count_alleles (max_allele = 1 )
831844 ht_seg = ht [ac .is_segregating ()]
832-
845+ # Find sets of perfectly identical haplotypes
833846 ht_distinct_sets = ht_seg .distinct ()
834847
835848 cohort_labels = df_haps [cohort_col ].values
836849 cohorts = pd .unique (df_haps [cohort_col ].dropna ())
837-
850+ # Build N x N sharing matrix
838851 sharing_matrix = pd .DataFrame (0 , index = cohorts , columns = cohorts , dtype = int )
839852
840853 for s in ht_distinct_sets :
841854 indices = list (s )
842855 cohorts_in_set = set (cohort_labels [indices ])
843856 cohorts_in_set .discard (None )
844857 cohorts_in_set = [c for c in cohorts_in_set if pd .notna (c )]
858+ # Increment the symmetric matrix for every unique pair of cohorts in this group
845859 for i , c1 in enumerate (cohorts_in_set ):
846860 for c2 in cohorts_in_set [i + 1 :]:
847861 sharing_matrix .loc [c1 , c2 ] += 1
@@ -891,6 +905,7 @@ def plot_haplotype_sharing_arc(
891905
892906 n_cohorts = len (cohorts )
893907 cohort_list = list (cohorts )
908+ # Position cohorts evenly along a horizontal axis from x=0 to x=1
894909 x_positions = np .linspace (0 , 1 , n_cohorts )
895910 cohort_x = {c : x for c , x in zip (cohort_list , x_positions )}
896911
@@ -904,7 +919,7 @@ def plot_haplotype_sharing_arc(
904919 max_sharing = sharing_matrix .values .max ()
905920 if max_sharing == 0 :
906921 max_sharing = 1
907-
922+ # Iterate over all unique pairs of cohorts
908923 for i in range (n_cohorts ):
909924 for j in range (i + 1 , n_cohorts ):
910925 c1 = cohort_list [i ]
@@ -916,9 +931,9 @@ def plot_haplotype_sharing_arc(
916931 x1 = cohort_x [c1 ]
917932 x2 = cohort_x [c2 ]
918933 arc_height = abs (x2 - x1 ) * 0.5
919-
934+ # Scale arc thickness linearly based on sharing count
920935 line_width = 1 + (count / max_sharing ) * 9
921-
936+ # Generate points for the arc
922937 t = np .linspace (0 , np .pi , 50 )
923938 arc_x = x1 + (x2 - x1 ) * (1 - np .cos (t )) / 2
924939 arc_y = arc_height * np .sin (t )
@@ -1025,7 +1040,7 @@ def plot_haplotype_sharing_chord(
10251040
10261041 n_cohorts = len (cohorts )
10271042 cohort_list = list (cohorts )
1028-
1043+ # Position cohorts evenly around a circular layout with radius 1.0
10291044 angles = np .linspace (0 , 2 * np .pi , n_cohorts , endpoint = False )
10301045 radius = 1.0
10311046 cohort_x = {c : radius * np .cos (a ) for c , a in zip (cohort_list , angles )}
@@ -1041,7 +1056,7 @@ def plot_haplotype_sharing_chord(
10411056 max_sharing = sharing_matrix .values .max ()
10421057 if max_sharing == 0 :
10431058 max_sharing = 1
1044-
1059+ # Iterate over all unique pairs of cohorts
10451060 for i in range (n_cohorts ):
10461061 for j in range (i + 1 , n_cohorts ):
10471062 c1 = cohort_list [i ]
@@ -1054,7 +1069,8 @@ def plot_haplotype_sharing_chord(
10541069 x2 , y2 = cohort_x [c2 ], cohort_y [c2 ]
10551070
10561071 line_width = 1 + (count / max_sharing ) * 9
1057-
1072+ # Draw a quadratic Bezier curve passing through the center (0,0)
1073+ # t from 0 to 1 traces the curve from cohort 1 to cohort 2
10581074 t = np .linspace (0 , 1 , 50 )
10591075 cx , cy = 0 , 0
10601076 chord_x = (1 - t ) ** 2 * x1 + 2 * (1 - t ) * t * cx + t ** 2 * x2
0 commit comments