WIP: handle sample_indices when surveillance_use_only

leehart · leehart · commit 6c4e74f60759 · 2025-06-13T17:46:11.000+01:00
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -1060,6 +1060,36 @@ def _prep_sample_selection_cache_params(
         sample_query_options: Optional[base_params.sample_query_options],
         sample_indices: Optional[base_params.sample_indices],
     ) -> Tuple[List[str], Optional[List[int]]]:
+        # Check that `sample_query` and `sample_indices` are not both provided.
+        base_params.validate_sample_selection_params(
+            sample_query=sample_query, sample_indices=sample_indices
+        )
+
+        # Resolve query to a list of integers for more cache hits - we
+        # do this because there are different ways to write the same pandas
+        # query, and so it's better to evaluate the query and use a list of
+        # integer indices instead.
+
+        # Scenario 1: Neither `sample_query` nor `sample_indices` was given,
+        #             and there is no internal `sample_query`,
+        #             so no `sample_indices` will be returned.
+
+        # Scenario 2: Neither `sample_query` nor `sample_indices` was given,
+        #             but there is an internal `sample_query`,
+        #             which will be converted into `sample_indices` and returned.
+
+        # Scenario 3: Only `sample_query` has been provided,
+        #             which will be converted into `sample_indices` and returned.
+        #             This will be handled the same as Scenario 2.
+
+        # Scenario 4: Only `sample_indices` has been provided,
+        #             and there is no internal `sample_query`,
+        #             simply return `sample_indices`.
+
+        # Scenario 5: Only `sample_indices` has been provided,
+        #             but there is also an internal `sample_query`, still return `sample_indices`,
+        #             which ought to already align with `sample_metadata`.
+
         # Normalise sample sets.
         prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
         prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
@@ -1068,20 +1098,46 @@ def _prep_sample_selection_cache_params(
         del sample_sets
         del sample_query
 
-        if prepared_sample_query is not None:
-            # Resolve query to a list of integers for more cache hits - we
-            # do this because there are different ways to write the same pandas
-            # query, and so it's better to evaluate the query and use a list of
-            # integer indices instead.
+        # Start with assuming there are no sample indices.
+        # This can be returned if there is neither a `prepared_sample_query` nor `sample_indices`.
+        prepared_sample_indices = None
+
+        # If there is a `prepared_sample_query` but no `sample_indices`...
+        if prepared_sample_query is not None and sample_indices is None:
+            # Get the unfiltered sample metadata for the given sample sets.
+            # Note: we don't want to pass the `sample_query` to `sample_metadata` here
+            # because we want to get the sample indices that represent the `sample_query`.
             df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)
+
+            # Default the sample_query_options to an empty dict.
             sample_query_options = sample_query_options or {}
+
             # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
+            # Get the Pandas Series as a NumPy array of Boolean values.
+            # Note: if `prepared_sample_query` is an internal query, this will select all samples,
+            # since `sample_metadata` should have already applied the internal query.
             loc_samples = df_samples.eval(
                 prepared_sample_query, **sample_query_options, engine="python"
             ).values
-            sample_indices = np.nonzero(loc_samples)[0].tolist()
 
-        return prepared_sample_sets, sample_indices
+            # Convert the sample indices to a list.
+            # Get the indices of the True values in the Boolean array and convert it to a list of integers.
+            prepared_sample_indices = np.nonzero(loc_samples)[0].tolist()
+
+        # If there is a `prepared_sample_query` and a `sample_indices`...
+        elif prepared_sample_query is not None and sample_indices is not None:
+            # Given that we don't allow both `sample_query` and `sample_indices` params in this function,
+            # we can deduce that the `prepared_sample_query` has resulted from an internal query.
+            # Given that `sample_indices` should be aligned with the results of `sample_metadata`,
+            # which should already apply the internal query, simply return the given `sample_indices`.
+
+            prepared_sample_indices = sample_indices
+
+        # If there is no `prepared_sample_query` but there is a `sample_indices`...
+        elif prepared_sample_query is None and sample_indices is not None:
+            prepared_sample_indices = sample_indices
+
+        return prepared_sample_sets, prepared_sample_indices
 
     def _results_cache_add_analysis_params(self, params: dict):
         super()._results_cache_add_analysis_params(params)
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
@@ -753,7 +753,7 @@ def _locate_site_class(
         try:
             loc_ann = self._cache_locate_site_class[cache_key]
 
-        except KeyError:
+        except KeyError as exc:
             # Access site annotations data.
             ds_ann = self._site_annotations_raw(
                 contig=region.contig,
@@ -877,7 +877,7 @@ def _locate_site_class(
                 ) | ((seq_cls == SEQ_CLS_DOWNSTREAM) & (seq_relpos_start > 10_000))
 
             else:
-                raise NotImplementedError(site_class)
+                raise NotImplementedError(site_class) from exc
 
             # N.B., site annotations data are provided for every position in the genome. We need to
             # therefore subset to SNP positions.
@@ -1007,33 +1007,44 @@ def snp_calls(
         )
 
         # Normalise parameters.
-        prepared_regions: Tuple[Region, ...] = tuple(parse_multi_region(self, region))
-        prepared_sample_sets: Tuple[str, ...] = tuple(
-            self._prep_sample_sets_param(sample_sets=sample_sets)
-        )
-
-        sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
-
-        if sample_indices is not None:
-            prepared_sample_indices: Optional[Tuple[int, ...]] = tuple(sample_indices)
-        else:
-            prepared_sample_indices = sample_indices
-
+        prepared_regions = parse_multi_region(self, region)
         prepared_site_mask = self._prep_optional_site_mask_param(site_mask=site_mask)
 
+        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
+        # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
+        (
+            prepared_sample_sets,
+            prepared_sample_indices,
+        ) = self._prep_sample_selection_cache_params(
+            sample_sets=sample_sets,
+            sample_query=sample_query,
+            sample_query_options=sample_query_options,
+            sample_indices=sample_indices,
+        )
+
         # Delete original parameters to prevent accidental use.
         del sample_sets
         del sample_query
         del sample_indices
         del region
         del site_mask
 
+        # Convert lists to tuples to avoid CacheMiss "TypeError: unhashable type: 'list'".
+        prepared_regions_tuple: Tuple[Region, ...] = tuple(prepared_regions)
+        prepared_sample_sets_tuple: Optional[Tuple[str, ...]] = (
+            tuple(prepared_sample_sets) if prepared_sample_sets is not None else None
+        )
+        prepared_sample_indices_tuple: Optional[Tuple[int, ...]] = (
+            tuple(prepared_sample_indices)
+            if prepared_sample_indices is not None
+            else None
+        )
+
+        # Note: `_snp_calls` should only take `sample_indices`, not `sample_query`, to facilitate caching.
         return self._snp_calls(
-            regions=prepared_regions,
-            sample_sets=prepared_sample_sets,
-            sample_query=sample_query_prepped,
-            sample_query_options=sample_query_options,
-            sample_indices=prepared_sample_indices,
+            regions=prepared_regions_tuple,
+            sample_sets=prepared_sample_sets_tuple,
+            sample_indices=prepared_sample_indices_tuple,
             site_mask=prepared_site_mask,
             site_class=site_class,
             cohort_size=cohort_size,
@@ -1127,8 +1138,6 @@ def _snp_calls(
         *,
         regions: Tuple[Region, ...],
         sample_sets,
-        sample_query,
-        sample_query_options,
         sample_indices,
         site_mask,
         site_class,
@@ -1139,10 +1148,15 @@ def _snp_calls(
         inline_array,
         chunks,
     ):
-        # Note: sample_sets and sample_query should be "prepared" before being passed to this private function.
+        # Get SNP calls and concatenate multiple sample sets and/or regions.
+
+        # Note: sample_sets should be "prepared" before being passed to this private function.
+
+        # Note: `_snp_calls` should only take `sample_indices`, not `sample_query`.
+        #       Use `_prep_sample_selection_cache_params` to convert `sample_query` to `sample_indices`.
+
+        # Note: we don't cache different sample_indices subsets, which are selected below.
 
-        # Get SNP calls and concatenate multiple sample sets and/or regions.
-        # Note: we don't cache different sample_query or sample_indices subsets.
         ds = self._cached_snp_calls(
             regions=regions,
             sample_sets=sample_sets,
@@ -1153,22 +1167,41 @@ def _snp_calls(
         )
 
         # Handle sample selection.
-        if sample_query is not None:
+        if sample_indices is not None:
+            # Note: `sample_indices` could be any tuple of integers, while the `ds` Dataset will contain data for all samples in the `sample_sets`.
+            # In other words, the internal `sample_query` is not being applied to `ds`.
+            # We need to get the filtered set of samples from `sample_metadata` and then select samples based on that set.
+
             # Get the relevant sample metadata.
-            df_samples = self.sample_metadata(sample_sets=sample_sets)
+            relevant_samples_df = self.sample_metadata(sample_sets=sample_sets)
 
-            # If there are no sample query options, then default to an empty dict.
-            sample_query_options = sample_query_options or {}
+            # We need to select only the samples that are identified by the `sample_indices` tuple relative to the results of `sample_metadata`.
+            # However, the `ds` Dataset contains data for all samples in the `sample_sets`, regardless of any internal `sample_query`.
 
-            ds = self._filter_sample_dataset(
-                ds=ds,
-                df_samples=df_samples,
-                sample_query=sample_query,
-                sample_query_options=sample_query_options,
-            )
+            # Get the samples identified via `sample_indices`.
+            # Note: this might raise `IndexError` if the user provides bad indices, e.g. "positional indexers are out-of-bounds".
+            # Note: `sample_indices` needs to be a list rather than tuple for `iloc`, otherwise `IndexingError`, e.g. "Too many indexers".
+            sample_indices_as_list = list(sample_indices)
+            selected_samples_df = relevant_samples_df.iloc[sample_indices_as_list]
 
-        elif sample_indices is not None:
-            ds = ds.isel(samples=list(sample_indices))
+            # Get the selected sample ids from the sample metadata DataFrame.
+            relevant_sample_ids = selected_samples_df["sample_id"].values
+
+            # Get all the sample ids from the unfiltered Dataset.
+            ds_sample_ids = ds.coords["sample_id"].values
+
+            # Get the indices of samples in the Dataset that match the relevant sample ids.
+            # Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
+            relevant_sample_indices = np.where(
+                np.isin(ds_sample_ids, relevant_sample_ids)
+            )[0]
+
+            # Preserve the behaviour of raising a `ValueError` instead of empty results.
+            if relevant_sample_indices.size == 0:
+                raise ValueError("No relevant samples found.")
+
+            # Select only the relevant samples from the Dataset.
+            ds = ds.isel(samples=relevant_sample_indices)
 
         # Handle cohort size, overrides min and max.
         if cohort_size is not None:
@@ -1939,7 +1972,7 @@ def _biallelic_diplotypes(
         inline_array,
         chunks,
     ):
-        # Note: this uses sample_indices and should not expect a sample_query.
+        # Note: this function uses sample_indices and should not expect a sample_query.
 
         # Access biallelic SNPs.
         ds = self.biallelic_snp_calls(