Skip to content

Commit a310eaf

Browse files
abhijeet-dhumal authored and ntkathole committed
fix(spark): BatchFeatureView with TransformationMode.PYTHON now reads all source columns
Signed-off-by: abhijeet-dhumal <abhijeetdhumal652@gmail.com>
1 parent e1b1d2d commit a310eaf

2 files changed

Lines changed: 150 additions & 4 deletions

File tree

sdk/python/feast/infra/compute_engines/feature_builder.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,12 +158,14 @@ def get_column_info(
158158
# we need to read ALL source columns, not just the output feature columns.
159159
# This is specifically for transformations that create new columns or need raw data.
160160
mode = getattr(getattr(view, "feature_transformation", None), "mode", None)
161-
if mode in ("ray", "pandas") or getattr(mode, "value", None) in (
161+
if mode in ("ray", "pandas", "python") or getattr(mode, "value", None) in (
162162
"ray",
163163
"pandas",
164+
"python",
164165
):
165-
# Signal to read all columns by passing empty list for feature_cols
166-
# The transformation will produce the output columns defined in the schema
166+
# Signal to read all columns by passing empty list for feature_cols.
167+
# "python" (BatchFeatureView) transformations need all raw source columns — the
168+
# UDF computes output features from raw input, not from pre-existing feature cols.
167169
feature_cols = []
168170

169171
return ColumnInfo(

sdk/python/tests/unit/infra/compute_engines/test_feature_builder.py

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from unittest.mock import MagicMock
1+
from unittest.mock import MagicMock, patch
2+
3+
import pytest
24

35
from feast.data_source import DataSource
46
from feast.infra.compute_engines.dag.context import ExecutionContext
@@ -7,6 +9,7 @@
79
from feast.infra.compute_engines.dag.plan import ExecutionPlan
810
from feast.infra.compute_engines.dag.value import DAGValue
911
from feast.infra.compute_engines.feature_builder import FeatureBuilder
12+
from feast.transformation.mode import TransformationMode
1013

1114
# ---------------------------
1215
# Minimal Mock DAGNode for testing
@@ -143,3 +146,144 @@ def test_recursive_featureview_build():
143146
- Source(hourly_driver_stats)"""
144147

145148
assert execution_plan.to_dag() == expected_output
149+
150+
151+
# ---------------------------------------------------------------------------
152+
# Helpers for get_column_info tests
153+
# ---------------------------------------------------------------------------
154+
155+
# Stable return value for _get_column_names: (join_keys, feature_cols, ts_col, created_ts_col)
156+
_MOCK_COLUMN_NAMES = (
157+
["user_id"],
158+
["user_avg_rating", "user_review_count"],
159+
"event_timestamp",
160+
None,
161+
)
162+
163+
164+
def _make_transformation(mode):
165+
"""Return a minimal transformation stub with the given mode."""
166+
t = MagicMock()
167+
t.mode = mode
168+
return t
169+
170+
171+
def _make_builder_for_column_info(transformation):
172+
"""
173+
Build a MockFeatureBuilder whose task.feature_view carries the given
174+
transformation. registry.get_entity is stubbed out per entity name.
175+
"""
176+
view = MagicMock()
177+
view.entities = ["user"]
178+
view.feature_transformation = transformation
179+
view.batch_source = MagicMock()
180+
view.batch_source.field_mapping = {}
181+
view.stream_source = None
182+
183+
task = MagicMock()
184+
task.project = "test_project"
185+
task.feature_view = view
186+
task.only_latest = False
187+
188+
registry = MagicMock()
189+
registry.get_entity.return_value = MagicMock(join_key="user_id")
190+
191+
builder = MockFeatureBuilder.__new__(MockFeatureBuilder)
192+
builder.registry = registry
193+
builder.task = task
194+
builder.nodes = []
195+
return builder, view
196+
197+
198+
# ---------------------------------------------------------------------------
199+
# Bug fix: TransformationMode.PYTHON must set feature_cols=[]
200+
#
201+
# Previously only "ray" and "pandas" were handled. "python" (the default mode
202+
# for @batch_feature_view) was missing, causing get_column_info to forward
203+
# the BFV *output* feature names (e.g. user_avg_rating) to the offline store
204+
# read step — columns that don't exist in raw source data — resulting in
205+
# UNRESOLVED_COLUMN errors at Spark analysis time.
206+
# ---------------------------------------------------------------------------
207+
208+
209+
@pytest.mark.parametrize(
210+
"mode",
211+
[
212+
TransformationMode.PYTHON,
213+
TransformationMode.PANDAS,
214+
TransformationMode.RAY,
215+
# String forms (getattr(mode, "value", None) path)
216+
"python",
217+
"pandas",
218+
"ray",
219+
],
220+
)
221+
def test_get_column_info_clears_feature_cols_for_udf_modes(mode):
222+
"""
223+
For transformation modes that compute output features from raw input
224+
(python, pandas, ray), get_column_info must set feature_cols=[] so the
225+
offline store read step issues SELECT * instead of projecting the output
226+
feature names that don't exist in the raw source schema.
227+
"""
228+
builder, view = _make_builder_for_column_info(_make_transformation(mode))
229+
230+
with patch(
231+
"feast.infra.compute_engines.feature_builder._get_column_names",
232+
return_value=_MOCK_COLUMN_NAMES,
233+
):
234+
col_info = builder.get_column_info(view)
235+
236+
assert col_info.feature_cols == [], (
237+
f"Expected feature_cols=[] for TransformationMode {mode!r}, "
238+
f"got {col_info.feature_cols!r}. "
239+
"The offline store read step must not project output feature names "
240+
"that don't exist in the raw source schema."
241+
)
242+
assert col_info.join_keys == ["user_id"]
243+
assert col_info.ts_col == "event_timestamp"
244+
245+
246+
@pytest.mark.parametrize(
247+
"mode",
248+
[
249+
TransformationMode.SPARK_SQL,
250+
TransformationMode.SQL,
251+
TransformationMode.SPARK,
252+
"spark_sql",
253+
"sql",
254+
],
255+
)
256+
def test_get_column_info_preserves_feature_cols_for_non_udf_modes(mode):
257+
"""
258+
SQL/Spark-SQL transformations operate on already-projected columns and
259+
should NOT get feature_cols cleared — the source read must still select
260+
the named feature columns explicitly.
261+
"""
262+
builder, view = _make_builder_for_column_info(_make_transformation(mode))
263+
264+
with patch(
265+
"feast.infra.compute_engines.feature_builder._get_column_names",
266+
return_value=_MOCK_COLUMN_NAMES,
267+
):
268+
col_info = builder.get_column_info(view)
269+
270+
assert col_info.feature_cols == ["user_avg_rating", "user_review_count"], (
271+
f"Expected feature_cols to be preserved for mode {mode!r}, "
272+
f"got {col_info.feature_cols!r}."
273+
)
274+
275+
276+
def test_get_column_info_preserves_feature_cols_with_no_transformation():
277+
"""
278+
A plain FeatureView (no transformation) must retain its feature column
279+
names so the offline store read step selects only the required columns.
280+
"""
281+
builder, view = _make_builder_for_column_info(None)
282+
283+
with patch(
284+
"feast.infra.compute_engines.feature_builder._get_column_names",
285+
return_value=_MOCK_COLUMN_NAMES,
286+
):
287+
col_info = builder.get_column_info(view)
288+
289+
assert col_info.feature_cols == ["user_avg_rating", "user_review_count"]

0 commit comments

Comments (0)