@@ -75,6 +75,18 @@ def _setup_aim_palettes():
7575 "unassigned" : "black" ,
7676}
7777
78+ # Note: These column names are matched case-insensitively: both the
79+ # keys below and the column names read from the CSV are converted
80+ # to lowercase before these dtypes are applied.
81+ AIM_METADATA_DTYPE = {
82+ "aim_species_fraction_arab" : "float64" ,
83+ "aim_species_fraction_colu" : "float64" ,
84+ "aim_species_fraction_colu_no2l" : "float64" ,
85+ "aim_species_gambcolu_arabiensis" : "object" ,
86+ "aim_species_gambiae_coluzzii" : "object" ,
87+ "aim_species" : "object" ,
88+ }
89+
7890
7991class Ag3 (AnophelesDataResource ):
8092 """Provides access to data from Ag3.x releases.
@@ -150,6 +162,8 @@ def __init__(
150162 discordant_read_calls_analysis = None ,
151163 pre = False ,
152164 tqdm_class = None ,
165+ unrestricted_use_only = False ,
166+ surveillance_use_only = False ,
153167 ** storage_options , # used by fsspec via init_filesystem()
154168 ):
155169 super ().__init__ (
@@ -158,14 +172,7 @@ def __init__(
158172 config_path = CONFIG_PATH ,
159173 cohorts_analysis = cohorts_analysis ,
160174 aim_analysis = aim_analysis ,
161- aim_metadata_dtype = {
162- "aim_species_fraction_arab" : "float64" ,
163- "aim_species_fraction_colu" : "float64" ,
164- "aim_species_fraction_colu_no2l" : "float64" ,
165- "aim_species_gambcolu_arabiensis" : "object" ,
166- "aim_species_gambiae_coluzzii" : "object" ,
167- "aim_species" : "object" ,
168- },
175+ aim_metadata_dtype = AIM_METADATA_DTYPE ,
169176 aim_ids = ("gambcolu_vs_arab" , "gamb_vs_colu" ),
170177 aim_palettes = AIM_PALETTES ,
171178 site_filters_analysis = site_filters_analysis ,
@@ -193,6 +200,8 @@ def __init__(
193200 virtual_contigs = VIRTUAL_CONTIGS ,
194201 gene_names = GENE_NAMES ,
195202 inversion_tag_path = INVERSION_TAG_PATH ,
203+ unrestricted_use_only = unrestricted_use_only ,
204+ surveillance_use_only = surveillance_use_only ,
196205 )
197206
198207 # set up caches
@@ -204,21 +213,24 @@ def v3_wild(self):
204213 3.0 release, excluding the lab crosses."""
205214 return [
206215 x
207- for x in self .sample_sets (release = "3.0" )["sample_set" ].tolist ()
216+ for x in self ._available_sample_sets (release = "3.0" )["sample_set" ].tolist ()
208217 if x != "AG1000G-X"
209218 ]
210219
211220 def __repr__ (self ):
212221 text = (
213222 f"<MalariaGEN Ag3 API client>\n "
214- f"Storage URL : { self ._url } \n "
215- f"Data releases available : { ', ' .join (self .releases )} \n "
216- f"Results cache : { self ._results_cache } \n "
217- f"Cohorts analysis : { self ._cohorts_analysis } \n "
218- f"AIM analysis : { self ._aim_analysis } \n "
219- f"Site filters analysis : { self ._site_filters_analysis } \n "
220- f"Software version : malariagen_data { malariagen_data .__version__ } \n "
221- f"Client location : { self .client_location } \n "
223+ f"Storage URL : { self ._url } \n "
224+ f"Data releases available : { ', ' .join (self ._available_releases )} \n "
225+ f"Results cache : { self ._results_cache } \n "
226+ f"Cohorts analysis : { self ._cohorts_analysis } \n "
227+ f"AIM analysis : { self ._aim_analysis } \n "
228+ f"Site filters analysis : { self ._site_filters_analysis } \n "
229+ f"Software version : malariagen_data { malariagen_data .__version__ } \n "
230+ f"Client location : { self .client_location } \n "
231+ f"Data filtered to unrestricted use only: { self ._unrestricted_use_only } \n "
232+ f"Data filtered to surveillance use only: { self ._surveillance_use_only } \n "
233+ f"Relevant data releases : { ', ' .join (self .releases )} \n "
222234 f"---\n "
223235 f"Please note that data are subject to terms of use,\n "
224236 f"for more information see https://www.malariagen.net/data\n "
@@ -252,7 +264,7 @@ def _repr_html_(self):
252264 <th style="text-align: left">
253265 Data releases available
254266 </th>
255- <td>{ ', ' .join (self .releases )} </td>
267+ <td>{ ', ' .join (self ._available_releases )} </td>
256268 </tr>
257269 <tr>
258270 <th style="text-align: left">
@@ -290,6 +302,24 @@ def _repr_html_(self):
290302 </th>
291303 <td>{ self .client_location } </td>
292304 </tr>
305+ <tr>
306+ <th style="text-align: left">
307+ Data filtered for unrestricted use only
308+ </th>
309+ <td>{ self ._unrestricted_use_only } </td>
310+ </tr>
311+ <tr>
312+ <th style="text-align: left">
313+ Data filtered for surveillance use only
314+ </th>
315+ <td>{ self ._surveillance_use_only } </td>
316+ </tr>
317+ <tr>
318+ <th style="text-align: left">
319+ Relevant data releases
320+ </th>
321+ <td>{ ', ' .join (self .releases )} </td>
322+ </tr>
293323 </tbody>
294324 </table>
295325 """
@@ -337,6 +367,34 @@ def cross_metadata(self):
337367 debug ("drop 'phenotype' column, not used" )
338368 df .drop ("phenotype" , axis = "columns" , inplace = True )
339369
370+ # Identify the crosses sample set.
371+ # Note: this sample set identifier is also hard-coded in `v3_wild()`.
372+ crosses_sample_set = "AG1000G-X"
373+
374+ # If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
375+ if (
376+ self ._unrestricted_use_only
377+ and not self ._sample_set_has_unrestricted_use (
378+ sample_set = crosses_sample_set
379+ )
380+ ):
381+ # Remove all the rows from the DataFrame (keeping its columns) and reset its index.
382+ df = df .iloc [0 :0 ].reset_index (drop = True )
383+
384+ # If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
385+ if self ._surveillance_use_only :
386+ crosses_surveillance_flags_df = self ._surveillance_flags (
387+ sample_sets = [crosses_sample_set ]
388+ )
389+ df = df .merge (
390+ crosses_surveillance_flags_df [["sample_id" , "is_surveillance" ]],
391+ on = "sample_id" ,
392+ how = "left" ,
393+ )
394+ df = df [df ["is_surveillance" ]]
395+ df = df .drop (columns = ["is_surveillance" ])
396+
397+ # Cache the cross metadata.
340398 self ._cache_cross_metadata = df
341399
342400 return self ._cache_cross_metadata .copy ()
0 commit comments