Skip to content

Commit 5b51ca0

Browse files
Merge branch 'master' into GH1113-plink-filename-hash
2 parents 3eca47d + 2fdabf4 commit 5b51ca0

2 files changed

Lines changed: 172 additions & 0 deletions

File tree

malariagen_data/anoph/heterozygosity.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,3 +708,120 @@ def plot_roh(
708708
return None
709709
else:
710710
return fig_all
711+
712+
@_check_types
713+
@doc(
714+
summary="Return windowed heterozygosity for a single sample over a genome region.",
715+
returns="""
716+
A DataFrame with one row per window. Columns are:
717+
`sample_id` is the identifier of the sample,
718+
`window_start` is the start position of the window,
719+
`window_stop` is the stop position of the window,
720+
`heterozygosity` is the heterozygosity in the window.
721+
""",
722+
)
723+
def sample_count_het(
724+
self,
725+
sample: base_params.sample,
726+
region: base_params.region,
727+
window_size: het_params.window_size = het_params.window_size_default,
728+
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
729+
sample_set: Optional[base_params.sample_set] = None,
730+
chunks: base_params.chunks = base_params.native_chunks,
731+
inline_array: base_params.inline_array = base_params.inline_array_default,
732+
) -> pd.DataFrame:
733+
# Normalise region parameter.
734+
region_prepped: Region = _parse_single_region(self, region)
735+
del region
736+
737+
# Compute windowed heterozygosity using private method.
738+
sample_id, sample_set, windows, counts = self._sample_count_het(
739+
sample=sample,
740+
region=region_prepped,
741+
site_mask=site_mask,
742+
window_size=window_size,
743+
sample_set=sample_set,
744+
chunks=chunks,
745+
inline_array=inline_array,
746+
)
747+
748+
# Build and return a DataFrame.
749+
df = pd.DataFrame(
750+
{
751+
"sample_id": sample_id,
752+
"window_start": windows[:, 0],
753+
"window_stop": windows[:, 1],
754+
"heterozygosity": counts / window_size,
755+
}
756+
)
757+
758+
return df
759+
760+
@_check_types
761+
@doc(
762+
summary="Compute mean heterozygosity for multiple cohorts over a genome region.",
763+
returns="""
764+
A DataFrame with one row per cohort. Columns are:
765+
`cohort` is the cohort identifier,
766+
`n_samples` is the number of samples in the cohort,
767+
`mean_heterozygosity` is the mean heterozygosity across all samples and windows in the cohort.
768+
""",
769+
)
770+
def cohort_heterozygosity(
771+
self,
772+
region: base_params.region,
773+
cohorts: base_params.cohorts,
774+
sample_sets: Optional[base_params.sample_sets] = None,
775+
sample_query: Optional[base_params.sample_query] = None,
776+
sample_query_options: Optional[base_params.sample_query_options] = None,
777+
window_size: het_params.window_size = het_params.window_size_default,
778+
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
779+
chunks: base_params.chunks = base_params.native_chunks,
780+
inline_array: base_params.inline_array = base_params.inline_array_default,
781+
) -> pd.DataFrame:
782+
# Normalise region parameter.
783+
region_prepped: Region = _parse_single_region(self, region)
784+
del region
785+
786+
# Set up cohort queries.
787+
cohort_queries = self._setup_cohort_queries(
788+
cohorts=cohorts,
789+
sample_sets=sample_sets,
790+
sample_query=sample_query,
791+
sample_query_options=sample_query_options,
792+
cohort_size=None,
793+
min_cohort_size=1,
794+
)
795+
796+
# Compute heterozygosity for each cohort.
797+
results = []
798+
for cohort_label, cohort_query in cohort_queries.items():
799+
# Get samples in this cohort.
800+
df_cohort_samples = self.sample_metadata(
801+
sample_sets=sample_sets,
802+
sample_query=cohort_query,
803+
)
804+
n_samples = len(df_cohort_samples)
805+
806+
# Compute heterozygosity for each sample and take the mean.
807+
het_values = []
808+
for sample_id in df_cohort_samples["sample_id"]:
809+
df_het = self.sample_count_het(
810+
sample=sample_id,
811+
region=region_prepped,
812+
window_size=window_size,
813+
site_mask=site_mask,
814+
chunks=chunks,
815+
inline_array=inline_array,
816+
)
817+
het_values.append(df_het["heterozygosity"].mean())
818+
819+
results.append(
820+
{
821+
"cohort": cohort_label,
822+
"n_samples": n_samples,
823+
"mean_heterozygosity": np.mean(het_values),
824+
}
825+
)
826+
827+
return pd.DataFrame(results)

tests/anoph/test_heterozygosity.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,58 @@ def test_plot_roh(fixture, api: AnophelesHetAnalysis):
205205

206206
# Check results.
207207
assert isinstance(fig, bokeh.models.GridPlot)
208+
209+
210+
@parametrize_with_cases("fixture,api", cases=".")
211+
def test_sample_count_het(fixture, api: AnophelesHetAnalysis):
212+
# Set up test parameters.
213+
all_sample_sets = api.sample_sets()["sample_set"].to_list()
214+
sample_set = random.choice(all_sample_sets)
215+
df_samples = api.sample_metadata(sample_sets=sample_set)
216+
sample = random.choice(df_samples["sample_id"].to_list())
217+
218+
het_params = dict(
219+
sample=sample,
220+
region=random.choice(api.contigs),
221+
sample_set=sample_set,
222+
window_size=20_000,
223+
)
224+
225+
# Run function under test.
226+
df = api.sample_count_het(**het_params)
227+
228+
# Check results.
229+
assert isinstance(df, pd.DataFrame)
230+
expected_columns = ["sample_id", "window_start", "window_stop", "heterozygosity"]
231+
for col in expected_columns:
232+
assert col in df.columns
233+
assert len(df) > 0
234+
assert (df["heterozygosity"] >= 0).all()
235+
assert (df["heterozygosity"] <= 1).all()
236+
237+
238+
@parametrize_with_cases("fixture,api", cases=".")
239+
def test_cohort_heterozygosity(fixture, api: AnophelesHetAnalysis):
240+
# Set up test parameters.
241+
all_sample_sets = api.sample_sets()["sample_set"].to_list()
242+
sample_set = random.choice(all_sample_sets)
243+
244+
cohort_params = dict(
245+
region=random.choice(api.contigs),
246+
cohorts="taxon",
247+
sample_sets=sample_set,
248+
window_size=20_000,
249+
)
250+
251+
# Run function under test.
252+
df = api.cohort_heterozygosity(**cohort_params)
253+
254+
# Check results.
255+
assert isinstance(df, pd.DataFrame)
256+
expected_columns = ["cohort", "n_samples", "mean_heterozygosity"]
257+
for col in expected_columns:
258+
assert col in df.columns
259+
assert len(df) > 0
260+
assert (df["n_samples"] > 0).all()
261+
assert (df["mean_heterozygosity"] >= 0).all()
262+
assert (df["mean_heterozygosity"] <= 1).all()

0 commit comments

Comments
 (0)