|
2 | 2 |
|
3 | 3 | import dask |
4 | 4 | import pandas as pd # type: ignore |
5 | | -from pandas import CategoricalDtype |
6 | | -import numpy as np # type: ignore |
7 | | -import allel # type: ignore |
8 | 5 | import plotly.express as px # type: ignore |
9 | 6 |
|
10 | 7 | import malariagen_data |
11 | 8 | from .anopheles import AnophelesDataResource |
12 | 9 |
|
13 | | -from numpydoc_decorator import doc |
14 | | -from .util import check_types, _karyotype_tags_n_alt |
15 | | -from .anoph import base_params |
16 | | -from typing import Optional, Literal, Annotated, TypeAlias |
17 | 10 |
|
18 | 11 | # silence dask performance warnings |
19 | 12 | dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore |
|
35 | 28 | GENE_NAMES = { |
36 | 29 | "AGAP004707": "Vgsc/para", |
37 | 30 | } |
| 31 | +INVERSION_TAG_PATH = "karyotype_tag_snps.csv" |
38 | 32 |
|
39 | 33 |
|
40 | 34 | def _setup_aim_palettes(): |
@@ -83,12 +77,6 @@ def _setup_aim_palettes(): |
83 | 77 | } |
84 | 78 |
|
85 | 79 |
|
86 | | -inversion_param: TypeAlias = Annotated[ |
87 | | - Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"], |
88 | | - "Name of inversion to infer karyotype for.", |
89 | | -] |
90 | | - |
91 | | - |
92 | 80 | class Ag3(AnophelesDataResource): |
93 | 81 | """Provides access to data from Ag3.x releases. |
94 | 82 |
|
@@ -203,6 +191,7 @@ def __init__( |
203 | 191 | taxon_colors=TAXON_COLORS, |
204 | 192 | virtual_contigs=VIRTUAL_CONTIGS, |
205 | 193 | gene_names=GENE_NAMES, |
| 194 | + inversion_tag_path=INVERSION_TAG_PATH, |
206 | 195 | ) |
207 | 196 |
|
208 | 197 | # set up caches |
@@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params): |
355 | 344 | super()._results_cache_add_analysis_params(params) |
356 | 345 | # override parent class to add AIM analysis |
357 | 346 | params["aim_analysis"] = self._aim_analysis |
358 | | - |
359 | | - @check_types |
360 | | - @doc( |
361 | | - summary="Load tag SNPs for a given inversion in Ag.", |
362 | | - ) |
363 | | - def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame: |
364 | | - # needs to be modified depending on where we are hosting |
365 | | - import importlib.resources |
366 | | - from . import resources |
367 | | - |
368 | | - with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path: |
369 | | - df_tag_snps = pd.read_csv(path, sep=",") |
370 | | - return df_tag_snps.query(f"inversion == '{inversion}'").reset_index() |
371 | | - |
372 | | - @check_types |
373 | | - @doc( |
374 | | - summary="Infer karyotype from tag SNPs for a given inversion in Ag.", |
375 | | - ) |
376 | | - def karyotype( |
377 | | - self, |
378 | | - inversion: inversion_param, |
379 | | - sample_sets: Optional[base_params.sample_sets] = None, |
380 | | - sample_query: Optional[base_params.sample_query] = None, |
381 | | - sample_query_options: Optional[base_params.sample_query_options] = None, |
382 | | - ) -> pd.DataFrame: |
383 | | - # load tag snp data |
384 | | - df_tagsnps = self.load_inversion_tags(inversion=inversion) |
385 | | - inversion_pos = df_tagsnps["position"] |
386 | | - inversion_alts = df_tagsnps["alt_allele"] |
387 | | - contig = inversion[0:2] |
388 | | - |
389 | | - # get snp calls for inversion region |
390 | | - start, end = np.min(inversion_pos), np.max(inversion_pos) |
391 | | - region = f"{contig}:{start}-{end}" |
392 | | - |
393 | | - ds_snps = self.snp_calls( |
394 | | - region=region, |
395 | | - sample_sets=sample_sets, |
396 | | - sample_query=sample_query, |
397 | | - sample_query_options=sample_query_options, |
398 | | - ) |
399 | | - |
400 | | - with self._spinner("Inferring karyotype from tag SNPs"): |
401 | | - # access variables we need |
402 | | - geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data) |
403 | | - pos = allel.SortedIndex(ds_snps["variant_position"].values) |
404 | | - samples = ds_snps["sample_id"].values |
405 | | - alts = ds_snps["variant_allele"].values.astype(str) |
406 | | - |
407 | | - # subset to position of inversion tags |
408 | | - mask = pos.locate_intersection(inversion_pos)[0] |
409 | | - alts = alts[mask] |
410 | | - geno = geno.compress(mask, axis=0).compute() |
411 | | - |
412 | | - # infer karyotype |
413 | | - gn_alt = _karyotype_tags_n_alt( |
414 | | - gt=geno, alts=alts, inversion_alts=inversion_alts |
415 | | - ) |
416 | | - is_called = geno.is_called() |
417 | | - |
418 | | - # calculate mean genotype for each sample whilst masking missing calls |
419 | | - av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0) |
420 | | - total_sites = np.sum(is_called, axis=0) |
421 | | - |
422 | | - df = pd.DataFrame( |
423 | | - { |
424 | | - "sample_id": samples, |
425 | | - "inversion": inversion, |
426 | | - f"karyotype_{inversion}_mean": av_gts, |
427 | | - # round the genotypes then convert to int |
428 | | - f"karyotype_{inversion}": av_gts.round().astype(int), |
429 | | - "total_tag_snps": total_sites, |
430 | | - }, |
431 | | - ) |
432 | | - # Allow filling missing values with "<NA>" visible placeholder. |
433 | | - kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True) |
434 | | - df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype) |
435 | | - |
436 | | - return df |
0 commit comments