Skip to content

Commit 03a6d5d

Browse files
authored
Merge pull request #724 from malariagen/GH716_add_constructor_params
Add `unrestricted_use_only` and `surveillance_use_only` constructor params
2 parents c659b69 + dad61a6 commit 03a6d5d

45 files changed

Lines changed: 5084 additions & 300 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

malariagen_data/af1.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ def __init__(
9191
discordant_read_calls_analysis=None,
9292
pre=False,
9393
tqdm_class=None,
94+
unrestricted_use_only=False,
95+
surveillance_use_only=False,
9496
**storage_options, # used by fsspec via init_filesystem()
9597
):
9698
super().__init__(
@@ -127,18 +129,23 @@ def __init__(
127129
virtual_contigs=None,
128130
gene_names=None,
129131
inversion_tag_path=None,
132+
unrestricted_use_only=unrestricted_use_only,
133+
surveillance_use_only=surveillance_use_only,
130134
)
131135

132136
def __repr__(self):
133137
text = (
134138
f"<MalariaGEN Af1 API client>\n"
135-
f"Storage URL : {self._url}\n"
136-
f"Data releases available : {', '.join(self.releases)}\n"
137-
f"Results cache : {self._results_cache}\n"
138-
f"Cohorts analysis : {self._cohorts_analysis}\n"
139-
f"Site filters analysis : {self._site_filters_analysis}\n"
140-
f"Software version : malariagen_data {malariagen_data.__version__}\n"
141-
f"Client location : {self.client_location}\n"
139+
f"Storage URL : {self._url}\n"
140+
f"Data releases available : {', '.join(self._available_releases)}\n"
141+
f"Results cache : {self._results_cache}\n"
142+
f"Cohorts analysis : {self._cohorts_analysis}\n"
143+
f"Site filters analysis : {self._site_filters_analysis}\n"
144+
f"Software version : malariagen_data {malariagen_data.__version__}\n"
145+
f"Client location : {self.client_location}\n"
146+
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
147+
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
148+
f"Relevant data releases : {', '.join(self.releases)}\n"
142149
f"---\n"
143150
f"Please note that data are subject to terms of use,\n"
144151
f"for more information see https://www.malariagen.net/data\n"
@@ -172,7 +179,7 @@ def _repr_html_(self):
172179
<th style="text-align: left">
173180
Data releases available
174181
</th>
175-
<td>{', '.join(self.releases)}</td>
182+
<td>{', '.join(self._available_releases)}</td>
176183
</tr>
177184
<tr>
178185
<th style="text-align: left">
@@ -204,6 +211,24 @@ def _repr_html_(self):
204211
</th>
205212
<td>{self.client_location}</td>
206213
</tr>
214+
<tr>
215+
<th style="text-align: left">
216+
Data filtered for unrestricted use only
217+
</th>
218+
<td>{self._unrestricted_use_only}</td>
219+
</tr>
220+
<tr>
221+
<th style="text-align: left">
222+
Data filtered for surveillance use only
223+
</th>
224+
<td>{self._surveillance_use_only}</td>
225+
</tr>
226+
<tr>
227+
<th style="text-align: left">
228+
Relevant data releases
229+
</th>
230+
<td>{', '.join(self.releases)}</td>
231+
</tr>
207232
</tbody>
208233
</table>
209234
"""

malariagen_data/ag3.py

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,18 @@ def _setup_aim_palettes():
7575
"unassigned": "black",
7676
}
7777

78+
# Note: These column names will be treated as case-insensitive,
79+
# because these column names and the column names from the CSV
80+
# will be converted to lowercase before applying these dtypes.
81+
AIM_METADATA_DTYPE = {
82+
"aim_species_fraction_arab": "float64",
83+
"aim_species_fraction_colu": "float64",
84+
"aim_species_fraction_colu_no2l": "float64",
85+
"aim_species_gambcolu_arabiensis": "object",
86+
"aim_species_gambiae_coluzzii": "object",
87+
"aim_species": "object",
88+
}
89+
7890

7991
class Ag3(AnophelesDataResource):
8092
"""Provides access to data from Ag3.x releases.
@@ -150,6 +162,8 @@ def __init__(
150162
discordant_read_calls_analysis=None,
151163
pre=False,
152164
tqdm_class=None,
165+
unrestricted_use_only=False,
166+
surveillance_use_only=False,
153167
**storage_options, # used by fsspec via init_filesystem()
154168
):
155169
super().__init__(
@@ -158,14 +172,7 @@ def __init__(
158172
config_path=CONFIG_PATH,
159173
cohorts_analysis=cohorts_analysis,
160174
aim_analysis=aim_analysis,
161-
aim_metadata_dtype={
162-
"aim_species_fraction_arab": "float64",
163-
"aim_species_fraction_colu": "float64",
164-
"aim_species_fraction_colu_no2l": "float64",
165-
"aim_species_gambcolu_arabiensis": "object",
166-
"aim_species_gambiae_coluzzii": "object",
167-
"aim_species": "object",
168-
},
175+
aim_metadata_dtype=AIM_METADATA_DTYPE,
169176
aim_ids=("gambcolu_vs_arab", "gamb_vs_colu"),
170177
aim_palettes=AIM_PALETTES,
171178
site_filters_analysis=site_filters_analysis,
@@ -193,6 +200,8 @@ def __init__(
193200
virtual_contigs=VIRTUAL_CONTIGS,
194201
gene_names=GENE_NAMES,
195202
inversion_tag_path=INVERSION_TAG_PATH,
203+
unrestricted_use_only=unrestricted_use_only,
204+
surveillance_use_only=surveillance_use_only,
196205
)
197206

198207
# set up caches
@@ -204,21 +213,24 @@ def v3_wild(self):
204213
3.0 release, excluding the lab crosses."""
205214
return [
206215
x
207-
for x in self.sample_sets(release="3.0")["sample_set"].tolist()
216+
for x in self._available_sample_sets(release="3.0")["sample_set"].tolist()
208217
if x != "AG1000G-X"
209218
]
210219

211220
def __repr__(self):
212221
text = (
213222
f"<MalariaGEN Ag3 API client>\n"
214-
f"Storage URL : {self._url}\n"
215-
f"Data releases available : {', '.join(self.releases)}\n"
216-
f"Results cache : {self._results_cache}\n"
217-
f"Cohorts analysis : {self._cohorts_analysis}\n"
218-
f"AIM analysis : {self._aim_analysis}\n"
219-
f"Site filters analysis : {self._site_filters_analysis}\n"
220-
f"Software version : malariagen_data {malariagen_data.__version__}\n"
221-
f"Client location : {self.client_location}\n"
223+
f"Storage URL : {self._url}\n"
224+
f"Data releases available : {', '.join(self._available_releases)}\n"
225+
f"Results cache : {self._results_cache}\n"
226+
f"Cohorts analysis : {self._cohorts_analysis}\n"
227+
f"AIM analysis : {self._aim_analysis}\n"
228+
f"Site filters analysis : {self._site_filters_analysis}\n"
229+
f"Software version : malariagen_data {malariagen_data.__version__}\n"
230+
f"Client location : {self.client_location}\n"
231+
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
232+
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
233+
f"Relevant data releases : {', '.join(self.releases)}\n"
222234
f"---\n"
223235
f"Please note that data are subject to terms of use,\n"
224236
f"for more information see https://www.malariagen.net/data\n"
@@ -252,7 +264,7 @@ def _repr_html_(self):
252264
<th style="text-align: left">
253265
Data releases available
254266
</th>
255-
<td>{', '.join(self.releases)}</td>
267+
<td>{', '.join(self._available_releases)}</td>
256268
</tr>
257269
<tr>
258270
<th style="text-align: left">
@@ -290,6 +302,24 @@ def _repr_html_(self):
290302
</th>
291303
<td>{self.client_location}</td>
292304
</tr>
305+
<tr>
306+
<th style="text-align: left">
307+
Data filtered for unrestricted use only
308+
</th>
309+
<td>{self._unrestricted_use_only}</td>
310+
</tr>
311+
<tr>
312+
<th style="text-align: left">
313+
Data filtered for surveillance use only
314+
</th>
315+
<td>{self._surveillance_use_only}</td>
316+
</tr>
317+
<tr>
318+
<th style="text-align: left">
319+
Relevant data releases
320+
</th>
321+
<td>{', '.join(self.releases)}</td>
322+
</tr>
293323
</tbody>
294324
</table>
295325
"""
@@ -337,6 +367,34 @@ def cross_metadata(self):
337367
debug("drop 'phenotype' column, not used")
338368
df.drop("phenotype", axis="columns", inplace=True)
339369

370+
# Identify the crosses sample set.
371+
# Note: this sample set identifier is also hard-coded in `v3_wild()`.
372+
crosses_sample_set = "AG1000G-X"
373+
374+
# If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
375+
if (
376+
self._unrestricted_use_only
377+
and not self._sample_set_has_unrestricted_use(
378+
sample_set=crosses_sample_set
379+
)
380+
):
381+
# Remove all the data from the DataFrame and reset its index.
382+
df = df.iloc[0:0].reset_index(drop=True)
383+
384+
# If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
385+
if self._surveillance_use_only:
386+
crosses_surveillance_flags_df = self._surveillance_flags(
387+
sample_sets=[crosses_sample_set]
388+
)
389+
df = df.merge(
390+
crosses_surveillance_flags_df[["sample_id", "is_surveillance"]],
391+
on="sample_id",
392+
how="left",
393+
)
394+
df = df[df["is_surveillance"]]
395+
df = df.drop(columns=["is_surveillance"])
396+
397+
# Cache the cross metadata.
340398
self._cache_cross_metadata = df
341399

342400
return self._cache_cross_metadata.copy()

malariagen_data/anoph/aim_data.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -138,31 +138,45 @@ def aim_calls(
138138
) -> xr.Dataset:
139139
self._require_aim_analysis()
140140

141-
# Normalise parameters.
142-
aims = self._prep_aims_param(aims=aims)
143-
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
141+
# Prepare parameters.
142+
prepared_aims = self._prep_aims_param(aims=aims)
143+
del aims
144+
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
144145
del sample_sets
145-
146-
# Access SNP calls and concatenate multiple sample sets and/or regions.
147-
ly = []
148-
for s in sample_sets_prepped:
149-
y = self._aim_calls_dataset(
150-
aims=aims,
151-
sample_set=s,
146+
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
147+
del sample_query
148+
149+
# Start a list of AIM calls Datasets, one for each sample set.
150+
aim_calls_datasets = []
151+
152+
# For each sample set...
153+
for sample_set in prepared_sample_sets:
154+
# Get the AIM calls for all samples in the set, as an xarray Dataset.
155+
aim_calls_dataset = self._aim_calls_dataset(
156+
aims=prepared_aims,
157+
sample_set=sample_set,
152158
)
153-
ly.append(y)
159+
160+
# Add this Dataset to the list.
161+
aim_calls_datasets.append(aim_calls_dataset)
154162

155163
# Concatenate data from multiple sample sets.
156-
ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
164+
ds = simple_xarray_concat(aim_calls_datasets, dim=DIM_SAMPLE)
157165

158-
# Handle sample query.
159-
if sample_query is not None:
160-
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
166+
# If there's a sample query...
167+
if prepared_sample_query is not None:
168+
# Get the relevant sample metadata.
169+
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
170+
171+
# If there are no sample query options, then default to an empty dict.
161172
sample_query_options = sample_query_options or {}
162-
loc_samples = df_samples.eval(sample_query, **sample_query_options).values
163-
if np.count_nonzero(loc_samples) == 0:
164-
raise ValueError(f"No samples found for query {sample_query!r}")
165-
ds = ds.isel(samples=loc_samples)
173+
174+
ds = self._filter_sample_dataset(
175+
ds=ds,
176+
df_samples=df_samples,
177+
sample_query=prepared_sample_query,
178+
sample_query_options=sample_query_options,
179+
)
166180

167181
return ds
168182

0 commit comments

Comments
 (0)