Skip to content

Commit 54efbc3

Browse files
authored
Merge branch 'master' into issue-845-bokeh
2 parents 5579d53 + ccc6592 commit 54efbc3

3 files changed

Lines changed: 316 additions & 0 deletions

File tree

malariagen_data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# flake8: noqa
2+
from .adar1 import Adar1
23
from .adir1 import Adir1
34
from .af1 import Af1
45
from .ag3 import Ag3

malariagen_data/adar1.py

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
import sys
2+
3+
import plotly.express as px # type: ignore
4+
5+
import malariagen_data
6+
from .anopheles import AnophelesDataResource
7+
8+
# Release versioning and the configuration file handed to the base class.
MAJOR_VERSION_NUMBER = 1
MAJOR_VERSION_PATH = "v1.0"
CONFIG_PATH = "v1.0-config.json"

# Google Cloud Storage locations (default, public default, and per-region).
GCS_DEFAULT_URL = "gs://vo_adar_release_master_us_central1/"
GCS_DEFAULT_PUBLIC_URL = "gs://vo_adar_release_master_us_central1/"
GCS_REGION_URLS = {"us-central1": "gs://vo_adar_release_master_us_central1"}

# Colour assigned to each taxon, drawn from plotly's qualitative palette.
TAXON_PALETTE = px.colors.qualitative.Plotly
TAXON_COLORS = {"darlingi": TAXON_PALETTE[0]}

# Names under which the Adar1 class registers its results caches.
XPEHH_GWSS_CACHE_NAME = "adar1_xpehh_gwss_v1"
IHS_GWSS_CACHE_NAME = "adar1_ihs_gwss_v1"
ROH_HMM_CACHE_NAME = "adar1_roh_hmm_v1"
25+
26+
27+
class Adar1(AnophelesDataResource):
    """Provides access to data from Adar1.0 releases.

    Parameters
    ----------
    url : str, optional
        Base path to data. Defaults to use Google Cloud Storage, or can
        be a local path on your file system if data have been downloaded.
    public_url : str, optional
        Defaults to ``GCS_DEFAULT_PUBLIC_URL``. Forwarded unchanged to the
        base class.
    site_filters_analysis : str, optional
        Site filters analysis version.
    bokeh_output_notebook : bool, optional
        If True (default), configure bokeh to output plots to the notebook.
    results_cache : str, optional
        Path to directory on local file system to save results.
    log : str or stream, optional
        File path or stream output for logging messages.
    debug : bool, optional
        Set to True to enable debug level logging.
    show_progress : bool, optional
        If True, show a progress bar during longer-running computations.
        The default can be overridden using an environmental variable
        named MGEN_SHOW_PROGRESS.
    check_location : bool, optional
        If True, use ipinfo to check the location of the client system.
    cohorts_analysis : optional
        Forwarded unchanged to the base class.
    discordant_read_calls_analysis : optional
        Forwarded unchanged to the base class.
    pre : bool, optional
        Forwarded unchanged to the base class. Defaults to False.
    tqdm_class : optional
        Forwarded unchanged to the base class.
    unrestricted_use_only : bool, optional
        Forwarded unchanged to the base class. Defaults to False.
    surveillance_use_only : bool, optional
        Forwarded unchanged to the base class. Defaults to False.
    **kwargs
        Passed through to fsspec when setting up file system access.

    Examples
    --------
    Access data from Google Cloud Storage (default):

        >>> import malariagen_data
        >>> adar1 = malariagen_data.Adar1()

    Access data downloaded to a local file system:

        >>> adar1 = malariagen_data.Adar1("/local/path/to/vo_adar_release/")

    Access data from Google Cloud Storage, with caching on the local file system
    in a directory named "gcs_cache":

        >>> adar1 = malariagen_data.Adar1(
        ...     "simplecache::gs://vo_adar_release_master_us_central1",
        ...     simplecache=dict(cache_storage="gcs_cache"),
        ... )

    Set up caching of some longer-running computations on the local file system,
    in a directory named "results_cache":

        >>> adar1 = malariagen_data.Adar1(results_cache="results_cache")

    """

    # Cache names are class attributes taken from the module-level constants.
    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME

    def __init__(
        self,
        url=None,
        public_url=GCS_DEFAULT_PUBLIC_URL,
        bokeh_output_notebook=True,
        results_cache=None,
        log=sys.stdout,
        debug=False,
        show_progress=None,
        check_location=True,
        cohorts_analysis=None,
        site_filters_analysis=None,
        discordant_read_calls_analysis=None,
        pre=False,
        tqdm_class=None,
        unrestricted_use_only=False,
        surveillance_use_only=False,
        **storage_options,
    ):
        """Configure the base resource with the Adar1-specific settings.

        Caller-supplied arguments are forwarded as given; everything else
        (config path, GCS URLs, version info, "darlingi" defaults, GFF
        settings, taxon colours) is fixed by this module.
        """
        super().__init__(
            url=url,
            public_url=public_url,
            config_path=CONFIG_PATH,
            cohorts_analysis=cohorts_analysis,
            # No AIM configuration is supplied for this resource.
            aim_analysis=None,
            aim_metadata_dtype=None,
            aim_ids=None,
            aim_palettes=None,
            site_filters_analysis=site_filters_analysis,
            discordant_read_calls_analysis=discordant_read_calls_analysis,
            default_site_mask="darlingi",
            default_phasing_analysis="darlingi",
            default_coverage_calls_analysis="darlingi",
            bokeh_output_notebook=bokeh_output_notebook,
            results_cache=results_cache,
            log=log,
            debug=debug,
            show_progress=show_progress,
            check_location=check_location,
            pre=pre,
            gcs_default_url=GCS_DEFAULT_URL,
            gcs_region_urls=GCS_REGION_URLS,
            major_version_number=MAJOR_VERSION_NUMBER,
            major_version_path=MAJOR_VERSION_PATH,
            gff_gene_type="gene",
            gff_gene_name_attribute="Note",
            gff_default_attributes=("ID", "Parent", "Note", "description"),
            storage_options=storage_options,
            tqdm_class=tqdm_class,
            taxon_colors=TAXON_COLORS,
            # No virtual contigs, gene names or inversion tags are supplied.
            virtual_contigs=None,
            gene_names=None,
            inversion_tag_path=None,
            unrestricted_use_only=unrestricted_use_only,
            surveillance_use_only=surveillance_use_only,
        )

    def __repr__(self):
        """Return a plain-text summary of this client's configuration."""
        text = (
            f"<MalariaGEN Adar1 API client>\n"
            f"Storage URL             : {self._url}\n"
            f"Data releases available : {', '.join(self._available_releases)}\n"
            f"Results cache           : {self._results_cache}\n"
            f"Cohorts analysis        : {self._cohorts_analysis}\n"
            f"Site filters analysis   : {self._site_filters_analysis}\n"
            f"Software version        : malariagen_data {malariagen_data.__version__}\n"
            f"Client location         : {self.client_location}\n"
            f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
            f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
            f"Relevant data releases  : {', '.join(self.releases)}\n"
            f"---\n"
            f"Please note that data are subject to terms of use,\n"
            f"for more information see https://www.malariagen.net/data\n"
            f"or contact support@malariagen.net. For API documentation see \n"
            # Link to this class's own docs page (previously pointed at Adir1).
            f"https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adar1.html"
        )
        return text

    def _repr_html_(self):
        """Return an HTML table summarising this client's configuration."""
        # The API docs link targets the Adar1 page (previously pointed at Adir1).
        html = f"""
            <table class="malariagen-adar1">
                <thead>
                    <tr>
                        <th style="text-align: left" colspan="2">MalariaGEN Adar1 API client</th>
                    </tr>
                    <tr><td colspan="2" style="text-align: left">
                        Please note that data are subject to terms of use,
                        for more information see <a href="https://www.malariagen.net/data">
                        the MalariaGEN website</a> or contact support@malariagen.net.
                        See also the <a href="https://malariagen.github.io/malariagen-data-python/v{malariagen_data.__version__}/Adar1.html">Adar1 API docs</a>.
                    </td></tr>
                </thead>
                <tbody>
                    <tr>
                        <th style="text-align: left">
                            Storage URL
                        </th>
                        <td>{self._url}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data releases available
                        </th>
                        <td>{', '.join(self._available_releases)}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Results cache
                        </th>
                        <td>{self._results_cache}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Cohorts analysis
                        </th>
                        <td>{self._cohorts_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Site filters analysis
                        </th>
                        <td>{self._site_filters_analysis}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Software version
                        </th>
                        <td>malariagen_data {malariagen_data.__version__}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Client location
                        </th>
                        <td>{self.client_location}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data filtered for unrestricted use only
                        </th>
                        <td>{self._unrestricted_use_only}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Data filtered for surveillance use only
                        </th>
                        <td>{self._surveillance_use_only}</td>
                    </tr>
                    <tr>
                        <th style="text-align: left">
                            Relevant data releases
                        </th>
                        <td>{', '.join(self.releases)}</td>
                    </tr>
                </tbody>
            </table>
        """
        return html

tests/integration/test_adar1.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import pytest
2+
3+
from malariagen_data import Adar1, Region
4+
from malariagen_data.util import _locate_region, _resolve_region
5+
6+
7+
def setup_adar1(url="simplecache::gs://vo_adar_release_master_us_central1/", **kwargs):
    """Build an Adar1 client for the integration tests.

    ``check_location`` and ``show_progress`` default to False unless the
    caller sets them. With ``url=None`` the client is created without a URL
    argument; note that this helper's own default is a simplecache URL, so
    calling it with no arguments does not exercise Adar1's default URL.
    """
    for option, fallback in (("check_location", False), ("show_progress", False)):
        kwargs.setdefault(option, fallback)

    if url is not None:
        if url.startswith("simplecache::"):
            # Cache remote data in a local "gcs_cache" directory.
            kwargs["simplecache"] = {"cache_storage": "gcs_cache"}
        return Adar1(url, **kwargs)

    return Adar1(**kwargs)
19+
20+
21+
def test_repr():
    """The client can be constructed and its repr() is a string."""
    client = setup_adar1(check_location=True)
    assert isinstance(client, Adar1)
    summary = repr(client)
    assert isinstance(summary, str)
26+
27+
28+
@pytest.mark.parametrize(
    "region_raw",
    [
        "2",
        "gene-LOC125950257",
        "2:4871446-4871535",
        "2:2,630,355-2,633,221",
        Region("2", 4871446, 4871535),
    ],
)
def test_locate_region(region_raw):
    """Resolve each raw region and locate it within the contig's SNP positions."""
    # TODO Migrate this test.
    client = setup_adar1()
    genes = client.geneset(attributes=["ID"])
    region = _resolve_region(client, region_raw)
    pos = client.snp_sites(region=region.contig, field="POS")
    # NOTE: the REF alleles (field="REF") are not loaded yet; they are only
    # needed by a reference-sequence assertion that is still to be checked.
    loc_region = _locate_region(region, pos)

    # Both helpers must hand back the expected types.
    assert isinstance(loc_region, slice)
    assert isinstance(region, Region)

    # A bare contig name resolves to a Region without start or end.
    if region_raw == "2":
        assert region.contig == "2"
        assert region.start is None
        assert region.end is None

    # A Region passed in is returned unchanged.
    if isinstance(region_raw, Region):
        assert region == region_raw

    # A gene ID resolves to the coordinates recorded in the geneset, and the
    # located slice starts and ends exactly on those coordinates.
    if region_raw == "gene-LOC125950257":
        gene = genes.query("ID == 'gene-LOC125950257'").squeeze()
        assert region == Region(gene.contig, gene.start, gene.end)
        located = pos[loc_region]
        assert located[0] == gene.start
        assert located[-1] == gene.end
        # To be checked: the first five REF alleles of the located slice
        # against the expected sequence ["T", "T", "G", "T", "T"].

    # Region strings are parsed, with or without thousands separators.
    for text, expected in (
        ("2:4871446-4871535", Region("2", 4871446, 4871535)),
        ("2:2,630,355-2,633,221", Region("2", 2630355, 2633221)),
    ):
        if region_raw == text:
            assert region == expected

0 commit comments

Comments
 (0)