Skip to content

Commit 27e4283

Browse files
authored
Merge pull request #1236 from nw20/add_pf9
Adding Pf9 functionality
2 parents 9b9608a + f6ea824 commit 27e4283

File tree

5 files changed

+527
-1
lines changed

5 files changed

+527
-1
lines changed

malariagen_data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .anopheles import AnophelesDataResource, Region
88
from .pf7 import Pf7
99
from .pf8 import Pf8
10+
from .pf9 import Pf9
1011
from .pv4 import Pv4
1112
from .util import SiteClass
1213

malariagen_data/pf9.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import os
2+
3+
from .plasmodium import PlasmodiumDataResource
4+
5+
6+
class Pf9(PlasmodiumDataResource):
    """Provides access to data from the Pf9 release.

    Parameters
    ----------
    url : str, optional
        Base path to data. Default uses Google Cloud Storage "gs://pf9-release/",
        or specify a local path on your file system if data have been downloaded.
    data_config : str, optional
        Path to config for structure of Pf9 data resource. Defaults to config included
        with the malariagen_data package.
    **kwargs
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

        >>> import malariagen_data
        >>> pf9 = malariagen_data.Pf9()

    Access data downloaded to a local file system:

        >>> pf9 = malariagen_data.Pf9("/local/path/to/pf9-release/")

    """

    def __init__(
        self,
        url=None,
        data_config=None,
        **kwargs,
    ):
        # Fall back to the "pf9_config.json" that sits next to this module
        # when the caller does not supply a config path.
        if not data_config:
            working_dir = os.path.dirname(os.path.abspath(__file__))
            data_config = os.path.join(working_dir, "pf9_config.json")
        # Forward **kwargs so that the file system options documented above
        # actually reach the base class instead of being silently discarded.
        # NOTE(review): assumes PlasmodiumDataResource.__init__ accepts
        # **kwargs and hands them to the file system setup -- confirm.
        super().__init__(data_config=data_config, url=url, **kwargs)

malariagen_data/pf9_config.json

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
{
2+
"default_url": "gs://pf9-release/",
3+
"metadata_path": "metadata/Pf9_samples.txt",
4+
"reference_path": "reference/PlasmoDB-54-Pfalciparum3D7-Genome.zarr/",
5+
"reference_contigs": [
6+
"Pf3D7_01_v3",
7+
"Pf3D7_02_v3",
8+
"Pf3D7_03_v3",
9+
"Pf3D7_04_v3",
10+
"Pf3D7_05_v3",
11+
"Pf3D7_06_v3",
12+
"Pf3D7_07_v3",
13+
"Pf3D7_08_v3",
14+
"Pf3D7_09_v3",
15+
"Pf3D7_10_v3",
16+
"Pf3D7_11_v3",
17+
"Pf3D7_12_v3",
18+
"Pf3D7_13_v3",
19+
"Pf3D7_14_v3",
20+
"Pf3D7_API_v3",
21+
"Pf3D7_MIT_v3"
22+
],
23+
"annotations_path": "annotations/PlasmoDB-55_Pfalciparum3D7.gff.gz",
24+
"variant_calls_zarr_path": "zarr/",
25+
"default_variant_variables": {
26+
"FILTER_PASS": ["variants"],
27+
"is_snp": ["variants"],
28+
"numalt": ["variants"],
29+
"CDS": ["variants"]
30+
},
31+
"extended_calldata_variables": {
32+
"DP": ["variants", "samples"],
33+
"GQ": ["variants", "samples"],
34+
"MIN_DP": ["variants", "samples"],
35+
"PGT": ["variants", "samples"],
36+
"PID": ["variants", "samples"],
37+
"PS": ["variants", "samples"],
38+
"RGQ": ["variants", "samples"],
39+
"PL": ["variants", "samples", "genotypes"],
40+
"SB": ["variants", "samples", "sb_statistics"]
41+
},
42+
"extended_variant_fields": {
43+
"AC": ["variants", "alt_alleles"],
44+
"AF": ["variants", "alt_alleles"],
45+
"AN": ["variants"],
46+
"ANN_AA_length": ["variants", "alt_alleles"],
47+
"ANN_AA_pos": ["variants", "alt_alleles"],
48+
"ANN_Allele": ["variants", "alt_alleles"],
49+
"ANN_Annotation": ["variants", "alt_alleles"],
50+
"ANN_Annotation_Impact": ["variants", "alt_alleles"],
51+
"ANN_CDS_length": ["variants", "alt_alleles"],
52+
"ANN_CDS_pos": ["variants", "alt_alleles"],
53+
"ANN_Distance": ["variants", "alt_alleles"],
54+
"ANN_Feature_ID": ["variants", "alt_alleles"],
55+
"ANN_Feature_Type": ["variants", "alt_alleles"],
56+
"ANN_Gene_ID": ["variants", "alt_alleles"],
57+
"ANN_Gene_Name": ["variants", "alt_alleles"],
58+
"ANN_HGVS_c": ["variants", "alt_alleles"],
59+
"ANN_HGVS_p": ["variants", "alt_alleles"],
60+
"ANN_Rank": ["variants", "alt_alleles"],
61+
"ANN_Transcript_BioType": ["variants", "alt_alleles"],
62+
"ANN_cDNA_length": ["variants", "alt_alleles"],
63+
"ANN_cDNA_pos": ["variants", "alt_alleles"],
64+
"AS_BaseQRankSum": ["variants", "alt_alleles"],
65+
"AS_FS": ["variants", "alt_alleles"],
66+
"AS_InbreedingCoeff": ["variants", "alt_alleles"],
67+
"AS_MQ": ["variants", "alt_alleles"],
68+
"AS_MQRankSum": ["variants", "alt_alleles"],
69+
"AS_QD": ["variants", "alt_alleles"],
70+
"AS_ReadPosRankSum": ["variants", "alt_alleles"],
71+
"AS_SOR": ["variants", "alt_alleles"],
72+
"BaseQRankSum": ["variants"],
73+
"DP": ["variants"],
74+
"DS": ["variants"],
75+
"END": ["variants"],
76+
"ExcessHet": ["variants"],
77+
"FILTER_Apicoplast": ["variants"],
78+
"FILTER_Centromere": ["variants"],
79+
"FILTER_InternalHypervariable": ["variants"],
80+
"FILTER_LowQual": ["variants"],
81+
"FILTER_Low_VQSLOD": ["variants"],
82+
"FILTER_Mitochondrion": ["variants"],
83+
"FILTER_SubtelomericHypervariable": ["variants"],
84+
"FILTER_SubtelomericRepeat": ["variants"],
85+
"FILTER_VQSRTrancheINDEL99.50to99.60": ["variants"],
86+
"FILTER_VQSRTrancheINDEL99.60to99.80": ["variants"],
87+
"FILTER_VQSRTrancheINDEL99.80to99.90": ["variants"],
88+
"FILTER_VQSRTrancheINDEL99.90to99.95": ["variants"],
89+
"FILTER_VQSRTrancheINDEL99.95to100.00+": ["variants"],
90+
"FILTER_VQSRTrancheINDEL99.95to100.00": ["variants"],
91+
"FILTER_VQSRTrancheSNP99.50to99.60": ["variants"],
92+
"FILTER_VQSRTrancheSNP99.60to99.80": ["variants"],
93+
"FILTER_VQSRTrancheSNP99.80to99.90": ["variants"],
94+
"FILTER_VQSRTrancheSNP99.90to99.95": ["variants"],
95+
"FILTER_VQSRTrancheSNP99.95to100.00+": ["variants"],
96+
"FILTER_VQSRTrancheSNP99.95to100.00": ["variants"],
97+
"FS": ["variants"],
98+
"ID": ["variants"],
99+
"InbreedingCoeff": ["variants"],
100+
"LOF": ["variants"],
101+
"MLEAC": ["variants", "alt_alleles"],
102+
"MLEAF": ["variants", "alt_alleles"],
103+
"MQ": ["variants"],
104+
"MQRankSum": ["variants"],
105+
"NEGATIVE_TRAIN_SITE": ["variants"],
106+
"NMD": ["variants"],
107+
"POSITIVE_TRAIN_SITE": ["variants"],
108+
"QD": ["variants"],
109+
"QUAL": ["variants"],
110+
"RAW_MQandDP": ["variants", "ploidy"],
111+
"ReadPosRankSum": ["variants"],
112+
"RegionType": ["variants"],
113+
"SOR": ["variants"],
114+
"VQSLOD": ["variants"],
115+
"culprit": ["variants"],
116+
"set": ["variants"]
117+
}
118+
}

malariagen_data/plasmodium.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ def sample_metadata(self):
6262
if self._cache_sample_metadata is None:
6363
path = os.path.join(self._path, self.CONF["metadata_path"])
6464
with self._fs.open(path) as f:
65-
self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
65+
self._cache_sample_metadata = pd.read_csv(
66+
f, sep="\t", na_values="", low_memory=False
67+
)
6668
return self._cache_sample_metadata
6769

6870
def _open_variant_calls_zarr(self):

0 commit comments

Comments
 (0)