Skip to content

Commit 8983418

Browse files
committed
docs: add inline comments explaining haplotype sharing computation
1 parent 9fb0a7d commit 8983418

1 file changed

Lines changed: 28 additions & 12 deletions

File tree

malariagen_data/anoph/hapclust.py

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,17 @@ def _compute_haplotype_sharing(
791791
chunks,
792792
inline_array,
793793
):
794+
"""
795+
Computes the number of identical haplotypes shared between cohorts.
796+
797+
This process involves:
798+
1. Loading haplotypes for a genomic region.
799+
2. Assigning two cohort labels per diploid sample (one for each haplotype).
800+
3. Filtering down to segregating sites to speed up the identification of distinct haplotypes.
801+
4. Grouping all identical haplotypes together using `ht_seg.distinct()`.
802+
5. Counting, for every pair of cohorts, the number of identical-haplotype groups they share, and returning the sharing matrix.
803+
"""
804+
# Load phased genotypes for the specified region
794805
ds_haps = self.haplotypes(
795806
region=region,
796807
analysis=analysis,
@@ -802,17 +813,19 @@ def _compute_haplotype_sharing(
802813
chunks=chunks,
803814
inline_array=inline_array,
804815
)
805-
816+
# Load metadata to map samples to cohorts
806817
df_samples = self.sample_metadata(
807818
sample_sets=sample_sets,
808819
sample_query=sample_query,
809820
sample_query_options=sample_query_options,
810821
)
811-
822+
# Ensure metadata is in exactly the same order as samples in ds_haps
812823
samples_phased = ds_haps["sample_id"].values
813824
df_samples_phased = (
814825
df_samples.set_index("sample_id").loc[samples_phased].reset_index()
815826
)
827+
# Each diploid sample has 2 haplotypes. Duplicate each metadata row
828+
# so cohort labels align 1:1 with the flattened haplotype array.
816829
df_haps = df_samples_phased.loc[df_samples_phased.index.repeat(2)].reset_index(
817830
drop=True
818831
)
@@ -822,26 +835,27 @@ def _compute_haplotype_sharing(
822835
f"Column '{cohort_col}' not found in sample metadata. "
823836
f"Available columns: {', '.join(df_haps.columns)}"
824837
)
825-
838+
# Convert paired genotypes to a flat array: (n_variants, 2 * n_samples)
826839
gt = allel.GenotypeDaskArray(ds_haps["call_genotype"].data)
827840
with self._dask_progress(desc="Load haplotypes"):
828841
ht = gt.to_haplotypes().compute()
829-
842+
# Remove non-segregating sites (where all haplotypes are identical)
830843
ac = ht.count_alleles(max_allele=1)
831844
ht_seg = ht[ac.is_segregating()]
832-
845+
# Find sets of perfectly identical haplotypes
833846
ht_distinct_sets = ht_seg.distinct()
834847

835848
cohort_labels = df_haps[cohort_col].values
836849
cohorts = pd.unique(df_haps[cohort_col].dropna())
837-
850+
# Build N x N sharing matrix
838851
sharing_matrix = pd.DataFrame(0, index=cohorts, columns=cohorts, dtype=int)
839852

840853
for s in ht_distinct_sets:
841854
indices = list(s)
842855
cohorts_in_set = set(cohort_labels[indices])
843856
cohorts_in_set.discard(None)
844857
cohorts_in_set = [c for c in cohorts_in_set if pd.notna(c)]
858+
# Increment the symmetric matrix for every unique pair of cohorts in this group
845859
for i, c1 in enumerate(cohorts_in_set):
846860
for c2 in cohorts_in_set[i + 1 :]:
847861
sharing_matrix.loc[c1, c2] += 1
@@ -891,6 +905,7 @@ def plot_haplotype_sharing_arc(
891905

892906
n_cohorts = len(cohorts)
893907
cohort_list = list(cohorts)
908+
# Position cohorts evenly along a horizontal axis from x=0 to x=1
894909
x_positions = np.linspace(0, 1, n_cohorts)
895910
cohort_x = {c: x for c, x in zip(cohort_list, x_positions)}
896911

@@ -904,7 +919,7 @@ def plot_haplotype_sharing_arc(
904919
max_sharing = sharing_matrix.values.max()
905920
if max_sharing == 0:
906921
max_sharing = 1
907-
922+
# Iterate over all unique pairs of cohorts
908923
for i in range(n_cohorts):
909924
for j in range(i + 1, n_cohorts):
910925
c1 = cohort_list[i]
@@ -916,9 +931,9 @@ def plot_haplotype_sharing_arc(
916931
x1 = cohort_x[c1]
917932
x2 = cohort_x[c2]
918933
arc_height = abs(x2 - x1) * 0.5
919-
934+
# Scale arc thickness linearly based on sharing count
920935
line_width = 1 + (count / max_sharing) * 9
921-
936+
# Generate points for the arc
922937
t = np.linspace(0, np.pi, 50)
923938
arc_x = x1 + (x2 - x1) * (1 - np.cos(t)) / 2
924939
arc_y = arc_height * np.sin(t)
@@ -1025,7 +1040,7 @@ def plot_haplotype_sharing_chord(
10251040

10261041
n_cohorts = len(cohorts)
10271042
cohort_list = list(cohorts)
1028-
1043+
# Position cohorts evenly around a circular layout with radius 1.0
10291044
angles = np.linspace(0, 2 * np.pi, n_cohorts, endpoint=False)
10301045
radius = 1.0
10311046
cohort_x = {c: radius * np.cos(a) for c, a in zip(cohort_list, angles)}
@@ -1041,7 +1056,7 @@ def plot_haplotype_sharing_chord(
10411056
max_sharing = sharing_matrix.values.max()
10421057
if max_sharing == 0:
10431058
max_sharing = 1
1044-
1059+
# Iterate over all unique pairs of cohorts
10451060
for i in range(n_cohorts):
10461061
for j in range(i + 1, n_cohorts):
10471062
c1 = cohort_list[i]
@@ -1054,7 +1069,8 @@ def plot_haplotype_sharing_chord(
10541069
x2, y2 = cohort_x[c2], cohort_y[c2]
10551070

10561071
line_width = 1 + (count / max_sharing) * 9
1057-
1072+
# Draw a quadratic Bezier curve with its control point at the center (0, 0),
# so the chord bows toward (but does not pass through) the center
1073+
# t from 0 to 1 traces the curve from cohort 1 to cohort 2
10581074
t = np.linspace(0, 1, 50)
10591075
cx, cy = 0, 0
10601076
chord_x = (1 - t) ** 2 * x1 + 2 * (1 - t) * t * cx + t**2 * x2

0 commit comments

Comments
 (0)