Fix bug in applying aim_metadata_dtype. Amend data types.

leehart · leehart · commit 21af2d2b944d · 2025-05-30T13:03:08.000+01:00
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -77,6 +77,18 @@ def _setup_aim_palettes():
     "unassigned": "black",
 }
 
+# Note: These column names are matched case-insensitively, because
+# both these keys and the column names read from the CSV are
+# converted to lowercase before these dtypes are applied.
+AIM_METADATA_DTYPE = {
+    "aim_species_fraction_arab": "float64",
+    "aim_species_fraction_colu": "float64",
+    "aim_species_fraction_colu_no2l": "float64",
+    "aim_species_gambcolu_arabiensis": "object",
+    "aim_species_gambiae_coluzzii": "object",
+    "aim_species": "object",
+}
+
 
 class Ag3(AnophelesDataResource):
     """Provides access to data from Ag3.x releases.
@@ -162,14 +174,7 @@ def __init__(
             config_path=CONFIG_PATH,
             cohorts_analysis=cohorts_analysis,
             aim_analysis=aim_analysis,
-            aim_metadata_dtype={
-                "aim_species_fraction_arab": "float64",
-                "aim_species_fraction_colu": "float64",
-                "aim_species_fraction_colu_no2l": "float64",
-                "aim_species_gambcolu_arabiensis": "object",
-                "aim_species_gambiae_coluzzii": "object",
-                "aim_species": "object",
-            },
+            aim_metadata_dtype=AIM_METADATA_DTYPE,
             aim_ids=("gambcolu_vs_arab", "gamb_vs_colu"),
             aim_palettes=AIM_PALETTES,
             site_filters_analysis=site_filters_analysis,
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -210,7 +210,8 @@ def plot_frequencies_heatmap(
 
         # Indexing.
         if index is None:
-            # `list[Hashable]` is incompatible with `list`
+            # `list[Hashable]` is incompatible with the param for `list`
+            # Convert `df.index.names` to a `list[str]` instead.
             index_names_as_list = [str(name) for name in df.index.names]
             index = list(index_names_as_list)
         df = df.reset_index().copy()
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -3,15 +3,16 @@
 from typing import (
     Any,
     Callable,
+    DefaultDict,
     Dict,
     List,
     Mapping,
     Optional,
     Sequence,
     Tuple,
     Union,
-    cast,
 )
+from collections import defaultdict
 import warnings
 
 import ipyleaflet  # type: ignore
@@ -51,21 +52,22 @@ def __init__(
         # data resources, and so column names and dtype need to be
         # passed in as parameters.
         self._aim_metadata_columns: Optional[List[str]] = None
-        #  `dtype` of `dict[str, Any]` is incompatible with `pd.read_csv`
-        self._aim_metadata_dtype: Mapping[
-            str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
-        ] = {}
+        self._aim_metadata_dtype: Optional[Mapping[str, Any]] = {}
+
+        # Only apply `aim_metadata_dtype` if it is a `Mapping`.
         if isinstance(aim_metadata_dtype, Mapping):
-            self._aim_metadata_columns = list(aim_metadata_dtype.keys())
-            self._aim_metadata_dtype.update(
-                cast(
-                    Mapping[
-                        str,
-                        Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype],
-                    ],
-                    aim_metadata_dtype,
-                )
-            )
+            # Convert all of the column names to lowercase.
+            prepared_aim_metadata_dtype_dict = {
+                k.lower(): v for k, v in aim_metadata_dtype.items()
+            }
+
+            # Get all the column names from the prepared dict.
+            self._aim_metadata_columns = list(prepared_aim_metadata_dtype_dict.keys())
+
+            # Update the _aim_metadata_dtype with the prepared dict.
+            self._aim_metadata_dtype.update(prepared_aim_metadata_dtype_dict)
+
+        # Add the sample_id to the _aim_metadata_dtype.
         self._aim_metadata_dtype["sample_id"] = "object"
 
         # Set up taxon colors.
@@ -151,7 +153,7 @@ def _parse_general_metadata(
         self, sample_set: str, data: Union[bytes, Exception]
     ) -> pd.DataFrame:
         if isinstance(data, bytes):
-            dtype = {
+            dtype_dict = {
                 "sample_id": "object",
                 "partner_sample_id": "object",
                 "contributor": "object",
@@ -163,14 +165,9 @@ def _parse_general_metadata(
                 "longitude": "float64",
                 "sex_call": "object",
             }
-            #  `dtype` of `dict[str, str]` is incompatible with `pd.read_csv`
-            dtype_mapping = cast(
-                Mapping[
-                    str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]
-                ],
-                dtype,
-            )
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype_mapping, na_values="")
+            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
+            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
 
             # Ensure all column names are lower case.
             df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -255,7 +252,10 @@ def _parse_sequence_qc_metadata(
     ) -> pd.DataFrame:
         if isinstance(data, bytes):
             # Get the dtype of the constant columns.
-            dtype = self._sequence_qc_metadata_dtype
+            dtype_dict = self._sequence_qc_metadata_dtype
+
+            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
 
             # Read the CSV using the dtype dict.
             df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
@@ -272,8 +272,8 @@ def _parse_sequence_qc_metadata(
 
             # Add the sequence QC columns with appropriate missing values.
             # For each column, set the value to either NA or NaN.
-            for c, dtype in self._sequence_qc_metadata_dtype.items():
-                if pd.api.types.is_integer_dtype(dtype):
+            for c, datum_dtype in self._sequence_qc_metadata_dtype.items():
+                if pd.api.types.is_integer_dtype(datum_dtype):
                     # Note: this creates a column with dtype int64.
                     df[c] = -1
                 else:
@@ -378,11 +378,8 @@ def _parse_surveillance_flags(
             "sample_id": "object",
             "is_surveillance": "boolean",
         }
-        #  `dtype` of `dict[str, str]` is incompatible with `read_csv`
-        dtype = cast(
-            Mapping[str, Union[str, type, np.dtype, pd.api.extensions.ExtensionDtype]],
-            dtype_dict,
-        )
+        # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+        dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
 
         if isinstance(data, bytes):
             # Read the CSV data.
@@ -516,7 +513,11 @@ def _parse_cohorts_metadata(
     ) -> pd.DataFrame:
         if isinstance(data, bytes):
             # Parse CSV data.
-            dtype = self._cohorts_metadata_dtype
+            dtype_dict = self._cohorts_metadata_dtype
+
+            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
+            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
+
             df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
 
             # Ensure all column names are lower case.
@@ -590,14 +591,19 @@ def _parse_aim_metadata(
         assert self._aim_metadata_columns is not None
         assert self._aim_metadata_dtype is not None
         if isinstance(data, bytes):
-            # Parse CSV data.
-            df = pd.read_csv(
-                io.BytesIO(data), dtype=self._aim_metadata_dtype, na_values=""
-            )
+            # Parse CSV data but don't apply the dtype yet.
+            df = pd.read_csv(io.BytesIO(data), na_values="")
 
-            # Ensure all column names are lower case.
+            # Convert all column names to lowercase.
             df.columns = [c.lower() for c in df.columns]  # type: ignore
 
+            # For each column in the DataFrame...
+            for c in df.columns:
+                # Apply the corresponding dtype from `_aim_metadata_dtype`.
+                # Convert the type to a NumPy dtype.
+                col_dtype_as_np = np.dtype(self._aim_metadata_dtype[c])
+                df[c] = df[c].astype(col_dtype_as_np)
+
             return df
 
         elif isinstance(data, FileNotFoundError):
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
@@ -268,7 +268,7 @@ def validate_metadata(df, expected_columns):
 
     # Check column types.
     for c in df.columns:
-        assert df[c].dtype.kind == expected_columns[c]
+        assert df[c].dtype.kind == expected_columns[c], c
 
 
 @parametrize_with_cases("fixture,api", cases=".")