Skip to content

Commit fdebfd4

Browse files
committed
WIP: dev support for unrestricted_use_only, surveillance_use_only params
1 parent ea950fc commit fdebfd4

7 files changed

Lines changed: 594 additions & 136 deletions

File tree

malariagen_data/af1.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -136,15 +136,16 @@ def __init__(
136136
def __repr__(self):
137137
text = (
138138
f"<MalariaGEN Af1 API client>\n"
139-
f"Storage URL : {self._url}\n"
140-
f"Data releases available : {', '.join(self.releases)}\n"
141-
f"Results cache : {self._results_cache}\n"
142-
f"Cohorts analysis : {self._cohorts_analysis}\n"
143-
f"Site filters analysis : {self._site_filters_analysis}\n"
144-
f"Software version : malariagen_data {malariagen_data.__version__}\n"
145-
f"Client location : {self.client_location}\n"
139+
f"Storage URL : {self._url}\n"
140+
f"Data releases available : {', '.join(self._available_releases)}\n"
141+
f"Results cache : {self._results_cache}\n"
142+
f"Cohorts analysis : {self._cohorts_analysis}\n"
143+
f"Site filters analysis : {self._site_filters_analysis}\n"
144+
f"Software version : malariagen_data {malariagen_data.__version__}\n"
145+
f"Client location : {self.client_location}\n"
146146
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
147147
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
148+
f"Relevant data releases : {', '.join(self.releases)}\n"
148149
f"---\n"
149150
f"Please note that data are subject to terms of use,\n"
150151
f"for more information see https://www.malariagen.net/data\n"
@@ -178,7 +179,7 @@ def _repr_html_(self):
178179
<th style="text-align: left">
179180
Data releases available
180181
</th>
181-
<td>{', '.join(self.releases)}</td>
182+
<td>{', '.join(self._available_releases)}</td>
182183
</tr>
183184
<tr>
184185
<th style="text-align: left">
@@ -222,6 +223,12 @@ def _repr_html_(self):
222223
</th>
223224
<td>{self._surveillance_use_only}</td>
224225
</tr>
226+
<tr>
227+
<th style="text-align: left">
228+
Relevant data releases
229+
</th>
230+
<td>{', '.join(self.releases)}</td>
231+
</tr>
225232
</tbody>
226233
</table>
227234
"""

malariagen_data/ag3.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -210,23 +210,24 @@ def v3_wild(self):
210210
3.0 release, excluding the lab crosses."""
211211
return [
212212
x
213-
for x in self.sample_sets(release="3.0")["sample_set"].tolist()
213+
for x in self._available_sample_sets(release="3.0")["sample_set"].tolist()
214214
if x != "AG1000G-X"
215215
]
216216

217217
def __repr__(self):
218218
text = (
219219
f"<MalariaGEN Ag3 API client>\n"
220-
f"Storage URL : {self._url}\n"
221-
f"Data releases available : {', '.join(self.releases)}\n"
222-
f"Results cache : {self._results_cache}\n"
223-
f"Cohorts analysis : {self._cohorts_analysis}\n"
224-
f"AIM analysis : {self._aim_analysis}\n"
225-
f"Site filters analysis : {self._site_filters_analysis}\n"
226-
f"Software version : malariagen_data {malariagen_data.__version__}\n"
227-
f"Client location : {self.client_location}\n"
220+
f"Storage URL : {self._url}\n"
221+
f"Data releases available : {', '.join(self._available_releases)}\n"
222+
f"Results cache : {self._results_cache}\n"
223+
f"Cohorts analysis : {self._cohorts_analysis}\n"
224+
f"AIM analysis : {self._aim_analysis}\n"
225+
f"Site filters analysis : {self._site_filters_analysis}\n"
226+
f"Software version : malariagen_data {malariagen_data.__version__}\n"
227+
f"Client location : {self.client_location}\n"
228228
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
229229
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
230+
f"Relevant data releases : {', '.join(self.releases)}\n"
230231
f"---\n"
231232
f"Please note that data are subject to terms of use,\n"
232233
f"for more information see https://www.malariagen.net/data\n"
@@ -260,7 +261,7 @@ def _repr_html_(self):
260261
<th style="text-align: left">
261262
Data releases available
262263
</th>
263-
<td>{', '.join(self.releases)}</td>
264+
<td>{', '.join(self._available_releases)}</td>
264265
</tr>
265266
<tr>
266267
<th style="text-align: left">
@@ -310,6 +311,12 @@ def _repr_html_(self):
310311
</th>
311312
<td>{self._surveillance_use_only}</td>
312313
</tr>
314+
<tr>
315+
<th style="text-align: left">
316+
Relevant data releases
317+
</th>
318+
<td>{', '.join(self.releases)}</td>
319+
</tr>
313320
</tbody>
314321
</table>
315322
"""
@@ -357,6 +364,34 @@ def cross_metadata(self):
357364
debug("drop 'phenotype' column, not used")
358365
df.drop("phenotype", axis="columns", inplace=True)
359366

367+
# Identify the crosses sample set.
368+
# Note: this sample set identifier is also hard-coded in `v3_wild()`.
369+
crosses_sample_set = "AG1000G-X"
370+
371+
# If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
372+
if (
373+
self._unrestricted_use_only
374+
and not self._sample_set_has_unrestricted_use(
375+
sample_set=crosses_sample_set
376+
)
377+
):
378+
# Remove all the data from the DataFrame and reset its index.
379+
df = df.iloc[0:0].reset_index(drop=True)
380+
381+
# If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
382+
if self._surveillance_use_only:
383+
crosses_surveillance_flags_df = self._surveillance_flags(
384+
sample_sets=[crosses_sample_set]
385+
)
386+
df = df.merge(
387+
crosses_surveillance_flags_df[["sample_id", "is_surveillance"]],
388+
on="sample_id",
389+
how="left",
390+
)
391+
df = df[df["is_surveillance"]]
392+
df = df.drop(columns=["is_surveillance"])
393+
394+
# Cache the cross metadata.
360395
self._cache_cross_metadata = df
361396

362397
return self._cache_cross_metadata.copy()

malariagen_data/anoph/aim_data.py

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -138,35 +138,64 @@ def aim_calls(
138138
) -> xr.Dataset:
139139
self._require_aim_analysis()
140140

141-
# Normalise parameters.
142-
aims = self._prep_aims_param(aims=aims)
143-
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
141+
# Prepare parameters.
142+
prepared_aims = self._prep_aims_param(aims=aims)
143+
del aims
144+
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
144145
del sample_sets
145-
sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
146+
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
146147
del sample_query
147148

148-
# Access SNP calls and concatenate multiple sample sets and/or regions.
149-
ly = []
150-
for s in sample_sets_prepped:
151-
y = self._aim_calls_dataset(
152-
aims=aims,
153-
sample_set=s,
149+
# Start a list of AIM calls Datasets, one for each sample set.
150+
aim_calls_datasets = []
151+
152+
# For each sample set...
153+
for sample_set in prepared_sample_sets:
154+
# Get the AIM calls for all samples in the set, as a Xarray Dataset.
155+
aim_calls_dataset = self._aim_calls_dataset(
156+
aims=prepared_aims,
157+
sample_set=sample_set,
154158
)
155-
ly.append(y)
159+
160+
# Add this Dataset to the list.
161+
aim_calls_datasets.append(aim_calls_dataset)
156162

157163
# Concatenate data from multiple sample sets.
158-
ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
164+
ds = simple_xarray_concat(aim_calls_datasets, dim=DIM_SAMPLE)
159165

160-
# Handle sample query.
161-
if sample_query_prepped is not None:
162-
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
166+
# If there's a sample query...
167+
if prepared_sample_query is not None:
168+
# Get the relevant sample metadata.
169+
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
170+
171+
# If there are no sample query options, then default to an empty dict.
163172
sample_query_options = sample_query_options or {}
173+
174+
# Determine which samples match the sample query.
164175
loc_samples = df_samples.eval(
165-
sample_query_prepped, **sample_query_options
176+
prepared_sample_query, **sample_query_options
166177
).values
178+
179+
# Raise an error if no samples match the sample query.
167180
if np.count_nonzero(loc_samples) == 0:
168-
raise ValueError(f"No samples found for query {sample_query_prepped!r}")
169-
ds = ds.isel(samples=loc_samples)
181+
raise ValueError(
182+
f"No samples found for query {prepared_sample_query!r}"
183+
)
184+
185+
# Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
186+
relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values
187+
188+
# Get all the sample ids from the unfiltered AIM calls Dataset.
189+
ds_sample_ids = ds.coords["sample_id"].values
190+
191+
# Get the indices of samples in the AIM calls Dataset that match the relevant sample ids.
192+
# Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
193+
relevant_sample_indices = np.where(
194+
np.isin(ds_sample_ids, relevant_sample_ids)
195+
)[0]
196+
197+
# Select only the relevant samples from the AIM calls Dataset.
198+
ds = ds.isel(samples=relevant_sample_indices)
170199

171200
return ds
172201

0 commit comments

Comments
 (0)