malariagen
diff --git a/.github/actions/setup-python/action.yaml b/.github/actions/setup-python/action.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 6 additions & 2 deletions
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
Lines changed: 3 additions & 3 deletions
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
Lines changed: 4 additions & 1 deletion
diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py
Lines changed: 7 additions & 4 deletions
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
Lines changed: 107 additions & 38 deletions
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
Lines changed: 10 additions & 13 deletions
@@ -19,4 +19,4 @@ runs:
       shell: bash
       run: |
         poetry env use ${{ inputs.python-version }}
-        poetry install --extras dev
+        poetry install --with dev,test,docs
@@ -3,6 +3,7 @@
 __pycache__
 *.pyc
 dist
+.venv/
 .coverage
 coverage.xml
 .ipynb_checkpoints/
 
@@ -12,7 +12,7 @@ This package provides Python tools for accessing and analyzing genomic data from
 
 You'll need:
 
-- [pipx](https://python-poetry.org/) for installing Python tools
+- [pipx](https://pipx.pypa.io/) for installing Python tools
 - [git](https://git-scm.com/) for version control
 
 Both of these can be installed using your distribution's package manager or [Homebrew](https://brew.sh/) on Mac.
@@ -52,9 +52,13 @@ Both of these can be installed using your distribution's package manager or [Hom
 
    ```bash
    poetry env use 3.12
-   poetry install --extras dev
+   poetry install --with dev,test,docs
    ```
 
+   This installs the runtime dependencies along with the `dev`, `test`, and `docs`
+   [dependency groups](https://python-poetry.org/docs/managing-dependencies/#dependency-groups).
+   If you only need to run tests, `poetry install --with test` is sufficient.
+
    **Recommended**: Use `poetry run` to run commands inside the virtual environment:
 
    ```bash
 
@@ -607,9 +607,9 @@ def _read_sample_sets_manifest(self, *, single_release: str):
             # Get today's date in ISO format
             today_date_iso = date.today().isoformat()
             # Add an "unrestricted_use" column, set to True if terms-of-use expiry date <= today's date.
-            df["unrestricted_use"] = df[terms_of_use_expiry_date_column].apply(
-                lambda d: True if pd.isna(d) else (d <= today_date_iso)
-            )
+            # Vectorized operation: True if NaN, else (d <= today_date_iso)
+            s = df[terms_of_use_expiry_date_column]
+            df["unrestricted_use"] = s.isna() | (s <= today_date_iso)
             # Make the "unrestricted_use" column a nullable boolean, to allow missing data.
             df["unrestricted_use"] = df["unrestricted_use"].astype(pd.BooleanDtype())
 
 
@@ -69,7 +69,10 @@
     str,
     """
     A pandas query string to be evaluated against the sample metadata, to
-    select samples to be included in the returned data.
+    select samples to be included in the returned data. E.g.,
+    "country == 'Uganda'". If the query returns zero results, a warning
+    will be emitted with fuzzy-match suggestions for possible typos or
+    case mismatches.
     """,
 ]
 
 
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Tuple
 
 import allel  # type: ignore
@@ -540,8 +541,9 @@ def _insert_dipclust_snp_trace(
             figures.append(snp_trace)
             subplot_heights.append(snp_row_height * n_snps_transcript)
         else:
-            print(
-                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
+            warnings.warn(
+                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot.",
+                stacklevel=2,
             )
         return figures, subplot_heights, n_snps_transcript
 
@@ -607,8 +609,9 @@ def plot_diplotype_clustering_advanced(
             cnv_colorscale = cnv_params.colorscale_default
         if cohort_size and snp_transcript:
             cohort_size = None
-            print(
-                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None."
+            warnings.warn(
+                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None.",
+                stacklevel=2,
             )
 
         res = self.plot_diplotype_clustering(
 
@@ -1,4 +1,3 @@
-import re
 from textwrap import dedent
 from typing import Optional, Union, List
 
@@ -29,6 +28,13 @@ def _prep_samples_for_cohort_grouping(
         # Users can explicitly override with True/False.
         filter_unassigned = taxon_by == "taxon"
 
+    # Validate taxon_by.
+    if taxon_by not in df_samples.columns:
+        raise ValueError(
+            f"Invalid value for `taxon_by`: {taxon_by!r}. "
+            f"Must be the name of an existing column in the sample metadata."
+        )
+
     if filter_unassigned:
         # Remove samples with "intermediate" or "unassigned" taxon values,
         # as we only want cohorts with clean taxon calls.
@@ -43,40 +49,46 @@ def _prep_samples_for_cohort_grouping(
 
     # Add period column.
 
-    # Map supported period_by values to functions that return either the relevant pd.Period or pd.NaT per row.
-    period_by_funcs = {
-        "year": _make_sample_period_year,
-        "quarter": _make_sample_period_quarter,
-        "month": _make_sample_period_month,
+    # Map supported period_by values to vectorized functions that create Period arrays.
+    period_by_funcs_vectorized = {
+        "year": _make_sample_periods_year_vectorized,
+        "quarter": _make_sample_periods_quarter_vectorized,
+        "month": _make_sample_periods_month_vectorized,
     }
 
     # Get the matching function for the specified period_by value, or None.
-    period_by_func = period_by_funcs.get(period_by)
+    period_by_func_vectorized = period_by_funcs_vectorized.get(period_by)
 
     # If there were no matching functions for the specified period_by value...
-    if period_by_func is None:
+    if period_by_func_vectorized is None:
         # Raise a ValueError if the specified period_by value is not a column in the DataFrame.
         if period_by not in df_samples.columns:
             raise ValueError(
                 f"Invalid value for `period_by`: {period_by!r}. Either specify the name of an existing column "
                 "or a supported period: 'year', 'quarter', or 'month'."
             )
 
-        # Raise a ValueError if the specified period_by column does not contain instances pd.Period.
-        if (
-            not df_samples[period_by]
-            .apply(lambda value: pd.isnull(value) or isinstance(value, pd.Period))
-            .all()
-        ):
-            raise TypeError(
-                f"Invalid values in {period_by!r} column. Must be either pandas.Period or null."
-            )
+        # Validate the specified period_by column contains pandas Periods (or nulls).
+        s_period_by = df_samples[period_by]
+        if not isinstance(s_period_by.dtype, pd.PeriodDtype):
+            non_null = s_period_by.dropna()
+            if not non_null.map(lambda value: isinstance(value, pd.Period)).all():
+                raise TypeError(
+                    f"Invalid values in {period_by!r} column. Must be either pandas.Period or null."
+                )
 
         # Copy the specified period_by column to a new "period" column.
         df_samples["period"] = df_samples[period_by]
     else:
-        # Apply the matching period_by function to create a new "period" column.
-        df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
+        # Use the vectorized period creation function.
+        df_samples["period"] = period_by_func_vectorized(df_samples)
+
+    # Validate area_by.
+    if area_by not in df_samples.columns:
+        raise ValueError(
+            f"Invalid value for `area_by`: {area_by!r}. "
+            f"Must be the name of an existing column in the sample metadata."
+        )
 
     # Copy the specified area_by column to a new "area" column.
     df_samples["area"] = df_samples[area_by]
@@ -101,22 +113,39 @@ def _build_cohorts_from_sample_grouping(
     df_cohorts = df_cohorts.reset_index()
 
     # Add cohort helper variables.
-    cohort_period_start = df_cohorts["period"].apply(lambda v: v.start_time)
-    cohort_period_end = df_cohorts["period"].apply(lambda v: v.end_time)
-    df_cohorts["period_start"] = cohort_period_start
-    df_cohorts["period_end"] = cohort_period_end
+    # Vectorized extraction of period start/end times via the `.dt` accessor.
+    period = df_cohorts["period"]
+    if isinstance(period.dtype, pd.PeriodDtype):
+        df_cohorts["period_start"] = period.dt.start_time
+        df_cohorts["period_end"] = period.dt.end_time
+    else:
+        # Fallback for object dtype Period values.
+        df_cohorts["period_start"] = period.map(
+            lambda v: v.start_time if pd.notna(v) else pd.NaT
+        )
+        df_cohorts["period_end"] = period.map(
+            lambda v: v.end_time if pd.notna(v) else pd.NaT
+        )
+
     # Create a label that is similar to the cohort metadata,
     # although this won't be perfect.
+    # Vectorized string operations
     if taxon_by == frq_params.taxon_by_default:
-        df_cohorts["label"] = df_cohorts.apply(
-            lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
-        )
+        # Default case: area_taxon_short_period
+        area_str = df_cohorts["area"].astype(str)
+        taxon_short = df_cohorts[taxon_by].astype(str).str.slice(0, 4)
+        period_str = df_cohorts["period"].astype(str)
+        df_cohorts["label"] = area_str + "_" + taxon_short + "_" + period_str
     else:
-        # Replace non-alphanumeric characters in the taxon with underscores.
-        df_cohorts["label"] = df_cohorts.apply(
-            lambda v: f"{v.area}_{re.sub(r'[^A-Za-z0-9]+', '_', str(v[taxon_by]))}_{v.period}",
-            axis="columns",
+        # Non-default case: replace non-alphanumeric characters with underscores
+        area_str = df_cohorts["area"].astype(str)
+        taxon_clean = (
+            df_cohorts[taxon_by]
+            .astype(str)
+            .str.replace(r"[^A-Za-z0-9]+", "_", regex=True)
         )
+        period_str = df_cohorts["period"].astype(str)
+        df_cohorts["label"] = area_str + "_" + taxon_clean + "_" + period_str
 
     # Apply minimum cohort size.
     df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
@@ -173,6 +202,50 @@ def _make_sample_period_year(row):
         return pd.NaT
 
 
+def _make_sample_periods_month_vectorized(df_samples):
+    """Build a monthly period for each row from its "year" and "month" columns.
+
+    Parameters
+    ----------
+    df_samples : pandas.DataFrame
+        Data frame with numeric "year" and "month" columns.
+
+    Returns
+    -------
+    pandas.Series
+        Series of dtype "period[M]" sharing the index of `df_samples`. Rows
+        whose year or month is not greater than zero (including NaN) are NaT.
+
+    """
+    year = df_samples["year"]
+    month = df_samples["month"]
+    # NaN compares False here, so rows with a missing value are excluded too.
+    valid = (year > 0) & (month > 0)
+
+    out = pd.Series(pd.NaT, index=df_samples.index, dtype="period[M]")
+    if valid.any():
+        # Coerce to int64 so that float-typed columns (e.g., a column that
+        # contains NaN elsewhere) are passed on as whole-number fields.
+        out.loc[valid] = pd.PeriodIndex.from_fields(
+            year=year.loc[valid].to_numpy(dtype="int64"),
+            month=month.loc[valid].to_numpy(dtype="int64"),
+            freq="M",
+        )
+    return out
+
+
+def _make_sample_periods_quarter_vectorized(df_samples):
+    """Build a quarterly period for each row from its "year" and "month" columns.
+
+    Parameters
+    ----------
+    df_samples : pandas.DataFrame
+        Data frame with numeric "year" and "month" columns.
+
+    Returns
+    -------
+    pandas.Series
+        Series of dtype "period[Q-DEC]" sharing the index of `df_samples`.
+        Rows whose year or month is not greater than zero (including NaN)
+        are NaT.
+
+    """
+    year = df_samples["year"]
+    month = df_samples["month"]
+    # NaN compares False here, so rows with a missing value are excluded too.
+    valid = (year > 0) & (month > 0)
+
+    out = pd.Series(pd.NaT, index=df_samples.index, dtype="period[Q-DEC]")
+    if valid.any():
+        # Coerce to int64 so that float-typed columns (e.g., a column that
+        # contains NaN elsewhere) are passed on as whole-number fields.
+        out.loc[valid] = pd.PeriodIndex.from_fields(
+            year=year.loc[valid].to_numpy(dtype="int64"),
+            month=month.loc[valid].to_numpy(dtype="int64"),
+            freq="Q-DEC",
+        )
+    return out
+
+
+def _make_sample_periods_year_vectorized(df_samples):
+    """Build a yearly period for each row from its "year" column.
+
+    Parameters
+    ----------
+    df_samples : pandas.DataFrame
+        Data frame with a numeric "year" column.
+
+    Returns
+    -------
+    pandas.Series
+        Series of dtype "period[Y-DEC]" sharing the index of `df_samples`.
+        Rows whose year is not greater than zero (including NaN) are NaT.
+
+    """
+    year = df_samples["year"]
+    # NaN compares False here, so rows with a missing year are excluded too.
+    valid = year > 0
+
+    out = pd.Series(pd.NaT, index=df_samples.index, dtype="period[Y-DEC]")
+    if valid.any():
+        # Coerce to int64 so that a float-typed year column (e.g., one that
+        # contains NaN elsewhere) is passed on as whole-number fields.
+        out.loc[valid] = pd.PeriodIndex.from_fields(
+            year=year.loc[valid].to_numpy(dtype="int64"),
+            month=np.full(int(valid.sum()), 12, dtype="int64"),
+            freq="Y-DEC",
+        )
+    return out
+
+
 class AnophelesFrequencyAnalysis(AnophelesBase):
     def __init__(
         self,
@@ -263,14 +336,10 @@ def plot_frequencies_heatmap(
             index = list(index_names_as_list)
         df = df.reset_index().copy()
         if isinstance(index, list):
-            index_col = (
-                df[index]
-                .astype(str)
-                .apply(
-                    lambda row: ", ".join([o for o in row if o is not None]),
-                    axis="columns",
-                )
-            )
+            idx_vals = df[index].astype(str).to_numpy()
+            index_col = pd.Series(idx_vals[:, 0], index=df.index)
+            for j in range(1, idx_vals.shape[1]):
+                index_col = index_col + ", " + idx_vals[:, j]
         else:
             assert isinstance(index, str)
             index_col = df[index].astype(str)
 
@@ -446,29 +446,26 @@ def plot_genes(
 
             # Put gene pointers (▲ or ▼) in a new column, depending on the strand.
             # Except if the gene_label is null or an empty string, which should not be shown.
-            data["gene_pointer"] = data.apply(
-                lambda row: ("▼" if row["strand"] == "+" else "▲")
-                if row["gene_label"]
-                else "",
-                axis=1,
+            data["gene_pointer"] = np.where(
+                data["gene_label"].isna() | (data["gene_label"] == ""),
+                "",
+                np.where(data["strand"] == "+", "▼", "▲"),
+            )
 
             # Put the pointer above or below the gene rectangle, depending on + or - strand.
             neg_strand_pointer_y = orig_mid_y_range - 1.1
             pos_strand_pointer_y = orig_mid_y_range + 1.1
-            data["pointer_y"] = data["strand"].apply(
-                lambda strand: pos_strand_pointer_y
-                if strand == "+"
-                else neg_strand_pointer_y
+            # Vectorized operation: use np.where instead of Series.apply
+            data["pointer_y"] = np.where(
+                data["strand"] == "+", pos_strand_pointer_y, neg_strand_pointer_y
             )
 
             # Put the label above or below the gene rectangle, depending on + or - strand.
             neg_strand_label_y = orig_mid_y_range - 1.25
             pos_strand_label_y = orig_mid_y_range + 1.3
-            data["label_y"] = data["strand"].apply(
-                lambda strand: pos_strand_label_y
-                if strand == "+"
-                else neg_strand_label_y
+            # Vectorized operation: use np.where instead of Series.apply
+            data["label_y"] = np.where(
+                data["strand"] == "+", pos_strand_label_y, neg_strand_label_y
             )
 
             # Get the data as a ColumnDataSource.