Skip to content

Commit fbff766

Browse files
fix: Robust coordinate handling in phenotype-genetic data merge
- Fixed genetic data merge to dynamically detect and use correct coordinate names - Added comprehensive detection for both 'samples' and 'sample_id' coordinates - Implemented dimension alignment before merging xarray datasets - Added tests for the new phenotypes functions Resolves #789
1 parent 56f0d7a commit fbff766

2 files changed

Lines changed: 437 additions & 16 deletions

File tree

malariagen_data/anoph/phenotypes.py

Lines changed: 95 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -252,22 +252,93 @@ def _create_phenotype_dataset(
252252
ds = xr.Dataset(data_vars, coords=coords)
253253

254254
if variant_data is not None:
255-
if "samples" not in variant_data.dims:
255+
# Find the sample dimension and coordinate
256+
sample_dim = None
257+
sample_coord = None
258+
259+
# Check for sample dimension
260+
for dim_name in ["samples", "sample_id"]:
261+
if dim_name in variant_data.dims:
262+
sample_dim = dim_name
263+
break
264+
265+
if sample_dim is None:
256266
warnings.warn(
257-
"Variant data does not contain 'samples' dimension, cannot merge."
267+
f"Variant data does not contain 'samples' or 'sample_id' dimension. "
268+
f"Available dimensions: {list(variant_data.dims)}"
258269
)
259-
else:
260-
common_samples = list(
261-
set(sample_ids) & set(variant_data.coords["samples"].values)
270+
return ds
271+
272+
for coord_name in ["sample_id", "samples"]:
273+
if coord_name in variant_data.coords:
274+
sample_coord = coord_name
275+
break
276+
277+
if sample_coord is None:
278+
warnings.warn(
279+
f"Variant data does not contain 'samples' or 'sample_id' coordinate. "
280+
f"Available coordinates: {list(variant_data.coords.keys())}"
262281
)
263-
if not common_samples:
264-
warnings.warn(
265-
"No common samples found between phenotype and variant data"
282+
return ds
283+
284+
# Get variant sample IDs - use the correct coordinate name
285+
variant_sample_ids = variant_data.coords[sample_coord].values
286+
287+
# Find common samples
288+
common_samples = list(set(sample_ids) & set(variant_sample_ids))
289+
print(f"DEBUG: Found {len(common_samples)} common samples")
290+
291+
if not common_samples:
292+
warnings.warn(
293+
"No common samples found between phenotype and variant data"
294+
)
295+
return ds
296+
else:
297+
# Select common samples from phenotype dataset
298+
ds_common = ds.sel(samples=common_samples)
299+
300+
# Select common samples from variant dataset
301+
try:
302+
# NEW APPROACH: Use isel with boolean indexing instead of sel
303+
# This avoids the "no index found" error
304+
305+
sample_mask = pd.Series(variant_sample_ids).isin(common_samples)
306+
sample_indices = sample_mask[sample_mask].index.tolist()
307+
variant_data_common = variant_data.isel(
308+
{sample_dim: sample_indices}
309+
)
310+
311+
# Rename dimension to "samples" if it's not already
312+
if sample_dim != "samples":
313+
variant_data_common = variant_data_common.rename(
314+
{sample_dim: "samples"}
315+
)
316+
317+
if (
318+
sample_coord != "samples"
319+
and sample_coord in variant_data_common.coords
320+
):
321+
variant_data_common = variant_data_common.rename(
322+
{sample_coord: "samples"}
323+
)
324+
325+
variant_samples_selected = variant_data_common.coords[
326+
"samples"
327+
].values
328+
329+
ds_common_reordered = ds_common.sel(
330+
samples=variant_samples_selected
266331
)
267-
else:
268-
ds_common = ds.sel(samples=common_samples)
269-
variant_data_common = variant_data.sel(samples=common_samples)
270-
ds = xr.merge([ds_common, variant_data_common])
332+
333+
# Merge the datasets
334+
ds = xr.merge([ds_common_reordered, variant_data_common])
335+
336+
except Exception as e:
337+
warnings.warn(f"Error selecting/merging variant data: {e}")
338+
import traceback
339+
340+
traceback.print_exc()
341+
return ds
271342

272343
return ds
273344

@@ -466,6 +537,7 @@ def phenotypes(
466537
hap_error_message = None
467538

468539
sample_sets_norm = self._prep_sample_sets_param(sample_sets=sample_sets)
540+
469541
try:
470542
if hasattr(self, "snp_calls") and callable(self.snp_calls):
471543
variant_data = self.snp_calls(
@@ -519,16 +591,23 @@ def phenotypes(
519591
)
520592

521593
# 3. Merge into an xarray Dataset
522-
ds = self._create_phenotype_dataset(df_phenotypes, variant_data)
523-
return ds
594+
try:
595+
ds = self._create_phenotype_dataset(df_phenotypes, variant_data)
596+
return ds
597+
except Exception:
598+
import traceback
599+
600+
traceback.print_exc()
601+
# Return phenotype-only dataset as fallback
602+
return self._create_phenotype_dataset(df_phenotypes, None)
524603

525604
def phenotype_sample_sets(self) -> List[str]:
526605
"""
527606
Get list of sample sets that have phenotypic data available.
528607
"""
529-
all_sample_sets = self.sample_sets
608+
all_sample_sets = self.sample_sets()["sample_set"].tolist()
530609
phenotype_sample_sets = []
531-
base_phenotype_path = f"{self._url}/v3.2/phenotypes/all"
610+
base_phenotype_path = f"{self._url}v3.2/phenotypes/all"
532611
for sample_set in all_sample_sets:
533612
try:
534613
phenotype_path = f"{base_phenotype_path}/{sample_set}/phenotypes.csv"

0 commit comments

Comments
 (0)