|
2 | 2 |
|
3 | 3 | import dask |
4 | 4 | import pandas as pd # type: ignore |
5 | | -from pandas import CategoricalDtype |
6 | | -import numpy as np # type: ignore |
7 | | -import allel # type: ignore |
8 | 5 | import plotly.express as px # type: ignore |
9 | 6 |
|
10 | 7 | import malariagen_data |
11 | 8 | from .anopheles import AnophelesDataResource |
12 | 9 |
|
13 | | -from numpydoc_decorator import doc |
14 | | -from .util import check_types, _karyotype_tags_n_alt |
15 | | -from .anoph import base_params |
16 | | -from typing import Optional, Literal, Annotated, TypeAlias |
| 10 | +from typing import Literal, Annotated, TypeAlias |
17 | 11 |
|
18 | 12 | # silence dask performance warnings |
19 | 13 | dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore |
@@ -355,82 +349,3 @@ def _results_cache_add_analysis_params(self, params): |
355 | 349 | super()._results_cache_add_analysis_params(params) |
356 | 350 | # override parent class to add AIM analysis |
357 | 351 | params["aim_analysis"] = self._aim_analysis |
358 | | - |
359 | | - @check_types |
360 | | - @doc( |
361 | | - summary="Load tag SNPs for a given inversion in Ag.", |
362 | | - ) |
363 | | - def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame: |
364 | | - # needs to be modified depending on where we are hosting |
365 | | - import importlib.resources |
366 | | - from . import resources |
367 | | - |
368 | | - with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path: |
369 | | - df_tag_snps = pd.read_csv(path, sep=",") |
370 | | - return df_tag_snps.query(f"inversion == '{inversion}'").reset_index() |
371 | | - |
372 | | - @check_types |
373 | | - @doc( |
374 | | - summary="Infer karyotype from tag SNPs for a given inversion in Ag.", |
375 | | - ) |
376 | | - def karyotype( |
377 | | - self, |
378 | | - inversion: inversion_param, |
379 | | - sample_sets: Optional[base_params.sample_sets] = None, |
380 | | - sample_query: Optional[base_params.sample_query] = None, |
381 | | - sample_query_options: Optional[base_params.sample_query_options] = None, |
382 | | - ) -> pd.DataFrame: |
383 | | - # load tag snp data |
384 | | - df_tagsnps = self.load_inversion_tags(inversion=inversion) |
385 | | - inversion_pos = df_tagsnps["position"] |
386 | | - inversion_alts = df_tagsnps["alt_allele"] |
387 | | - contig = inversion[0:2] |
388 | | - |
389 | | - # get snp calls for inversion region |
390 | | - start, end = np.min(inversion_pos), np.max(inversion_pos) |
391 | | - region = f"{contig}:{start}-{end}" |
392 | | - |
393 | | - ds_snps = self.snp_calls( |
394 | | - region=region, |
395 | | - sample_sets=sample_sets, |
396 | | - sample_query=sample_query, |
397 | | - sample_query_options=sample_query_options, |
398 | | - ) |
399 | | - |
400 | | - with self._spinner("Inferring karyotype from tag SNPs"): |
401 | | - # access variables we need |
402 | | - geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data) |
403 | | - pos = allel.SortedIndex(ds_snps["variant_position"].values) |
404 | | - samples = ds_snps["sample_id"].values |
405 | | - alts = ds_snps["variant_allele"].values.astype(str) |
406 | | - |
407 | | - # subset to position of inversion tags |
408 | | - mask = pos.locate_intersection(inversion_pos)[0] |
409 | | - alts = alts[mask] |
410 | | - geno = geno.compress(mask, axis=0).compute() |
411 | | - |
412 | | - # infer karyotype |
413 | | - gn_alt = _karyotype_tags_n_alt( |
414 | | - gt=geno, alts=alts, inversion_alts=inversion_alts |
415 | | - ) |
416 | | - is_called = geno.is_called() |
417 | | - |
418 | | - # calculate mean genotype for each sample whilst masking missing calls |
419 | | - av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0) |
420 | | - total_sites = np.sum(is_called, axis=0) |
421 | | - |
422 | | - df = pd.DataFrame( |
423 | | - { |
424 | | - "sample_id": samples, |
425 | | - "inversion": inversion, |
426 | | - f"karyotype_{inversion}_mean": av_gts, |
427 | | - # round the genotypes then convert to int |
428 | | - f"karyotype_{inversion}": av_gts.round().astype(int), |
429 | | - "total_tag_snps": total_sites, |
430 | | - }, |
431 | | - ) |
432 | | - # Allow filling missing values with "<NA>" visible placeholder. |
433 | | - kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True) |
434 | | - df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype) |
435 | | - |
436 | | - return df |
0 commit comments