@@ -708,3 +708,120 @@ def plot_roh(
708708 return None
709709 else :
710710 return fig_all
711+
712+ @_check_types
713+ @doc (
714+ summary = "Return windowed heterozygosity for a single sample over a genome region." ,
715+ returns = """
716+ A DataFrame with one row per window. Columns are:
717+ `sample_id` is the identifier of the sample,
718+ `window_start` is the start position of the window,
719+ `window_stop` is the stop position of the window,
720+ `heterozygosity` is the heterozygosity in the window.
721+ """ ,
722+ )
723+ def sample_count_het (
724+ self ,
725+ sample : base_params .sample ,
726+ region : base_params .region ,
727+ window_size : het_params .window_size = het_params .window_size_default ,
728+ site_mask : Optional [base_params .site_mask ] = base_params .DEFAULT ,
729+ sample_set : Optional [base_params .sample_set ] = None ,
730+ chunks : base_params .chunks = base_params .native_chunks ,
731+ inline_array : base_params .inline_array = base_params .inline_array_default ,
732+ ) -> pd .DataFrame :
733+ # Normalise region parameter.
734+ region_prepped : Region = _parse_single_region (self , region )
735+ del region
736+
737+ # Compute windowed heterozygosity using private method.
738+ sample_id , sample_set , windows , counts = self ._sample_count_het (
739+ sample = sample ,
740+ region = region_prepped ,
741+ site_mask = site_mask ,
742+ window_size = window_size ,
743+ sample_set = sample_set ,
744+ chunks = chunks ,
745+ inline_array = inline_array ,
746+ )
747+
748+ # Build and return a DataFrame.
749+ df = pd .DataFrame (
750+ {
751+ "sample_id" : sample_id ,
752+ "window_start" : windows [:, 0 ],
753+ "window_stop" : windows [:, 1 ],
754+ "heterozygosity" : counts / window_size ,
755+ }
756+ )
757+
758+ return df
759+
760+ @_check_types
761+ @doc (
762+ summary = "Compute mean heterozygosity for multiple cohorts over a genome region." ,
763+ returns = """
764+ A DataFrame with one row per cohort. Columns are:
765+ `cohort` is the cohort identifier,
766+ `n_samples` is the number of samples in the cohort,
767+ `mean_heterozygosity` is the mean heterozygosity across all samples and windows in the cohort.
768+ """ ,
769+ )
770+ def cohort_heterozygosity (
771+ self ,
772+ region : base_params .region ,
773+ cohorts : base_params .cohorts ,
774+ sample_sets : Optional [base_params .sample_sets ] = None ,
775+ sample_query : Optional [base_params .sample_query ] = None ,
776+ sample_query_options : Optional [base_params .sample_query_options ] = None ,
777+ window_size : het_params .window_size = het_params .window_size_default ,
778+ site_mask : Optional [base_params .site_mask ] = base_params .DEFAULT ,
779+ chunks : base_params .chunks = base_params .native_chunks ,
780+ inline_array : base_params .inline_array = base_params .inline_array_default ,
781+ ) -> pd .DataFrame :
782+ # Normalise region parameter.
783+ region_prepped : Region = _parse_single_region (self , region )
784+ del region
785+
786+ # Set up cohort queries.
787+ cohort_queries = self ._setup_cohort_queries (
788+ cohorts = cohorts ,
789+ sample_sets = sample_sets ,
790+ sample_query = sample_query ,
791+ sample_query_options = sample_query_options ,
792+ cohort_size = None ,
793+ min_cohort_size = 1 ,
794+ )
795+
796+ # Compute heterozygosity for each cohort.
797+ results = []
798+ for cohort_label , cohort_query in cohort_queries .items ():
799+ # Get samples in this cohort.
800+ df_cohort_samples = self .sample_metadata (
801+ sample_sets = sample_sets ,
802+ sample_query = cohort_query ,
803+ )
804+ n_samples = len (df_cohort_samples )
805+
806+ # Compute heterozygosity for each sample and take the mean.
807+ het_values = []
808+ for sample_id in df_cohort_samples ["sample_id" ]:
809+ df_het = self .sample_count_het (
810+ sample = sample_id ,
811+ region = region_prepped ,
812+ window_size = window_size ,
813+ site_mask = site_mask ,
814+ chunks = chunks ,
815+ inline_array = inline_array ,
816+ )
817+ het_values .append (df_het ["heterozygosity" ].mean ())
818+
819+ results .append (
820+ {
821+ "cohort" : cohort_label ,
822+ "n_samples" : n_samples ,
823+ "mean_heterozygosity" : np .mean (het_values ),
824+ }
825+ )
826+
827+ return pd .DataFrame (results )
0 commit comments