|
1 | | -from unittest.mock import MagicMock |
| 1 | +from unittest.mock import MagicMock, patch |
| 2 | + |
| 3 | +import pytest |
2 | 4 |
|
3 | 5 | from feast.data_source import DataSource |
4 | 6 | from feast.infra.compute_engines.dag.context import ExecutionContext |
|
7 | 9 | from feast.infra.compute_engines.dag.plan import ExecutionPlan |
8 | 10 | from feast.infra.compute_engines.dag.value import DAGValue |
9 | 11 | from feast.infra.compute_engines.feature_builder import FeatureBuilder |
| 12 | +from feast.transformation.mode import TransformationMode |
10 | 13 |
|
11 | 14 | # --------------------------- |
12 | 15 | # Minimal Mock DAGNode for testing |
@@ -143,3 +146,144 @@ def test_recursive_featureview_build(): |
143 | 146 | - Source(hourly_driver_stats)""" |
144 | 147 |
|
145 | 148 | assert execution_plan.to_dag() == expected_output |
| 149 | + |
| 150 | + |
# ---------------------------------------------------------------------------
# Helpers for get_column_info tests
# ---------------------------------------------------------------------------
| 154 | + |
| 155 | +# Stable return value for _get_column_names: (join_keys, feature_cols, ts_col, created_ts_col) |
| 156 | +_MOCK_COLUMN_NAMES = ( |
| 157 | + ["user_id"], |
| 158 | + ["user_avg_rating", "user_review_count"], |
| 159 | + "event_timestamp", |
| 160 | + None, |
| 161 | +) |
| 162 | + |
| 163 | + |
| 164 | +def _make_transformation(mode): |
| 165 | + """Return a minimal transformation stub with the given mode.""" |
| 166 | + t = MagicMock() |
| 167 | + t.mode = mode |
| 168 | + return t |
| 169 | + |
| 170 | + |
def _make_builder_for_column_info(transformation):
    """Build a ``MockFeatureBuilder`` wired up for ``get_column_info`` tests.

    The builder is created with ``__new__`` so ``__init__`` is skipped; only
    the attributes assigned below (``registry``, ``task``, ``nodes``) are set
    on it explicitly.

    Args:
        transformation: Object assigned to ``view.feature_transformation``.
            Pass ``None`` to model a view without a transformation.

    Returns:
        A ``(builder, view)`` tuple, where ``view`` is the same mock that is
        installed as ``builder.task.feature_view``.
    """
    view = MagicMock()
    view.entities = ["user"]
    view.feature_transformation = transformation
    view.batch_source = MagicMock()
    view.batch_source.field_mapping = {}
    view.stream_source = None

    task = MagicMock()
    task.project = "test_project"
    task.feature_view = view
    task.only_latest = False

    registry = MagicMock()
    # Every get_entity call, whatever the entity name, yields the same stub
    # entity with join_key "user_id".
    registry.get_entity.return_value = MagicMock(join_key="user_id")

    # Bypass __init__ and set only the state these tests rely on.
    builder = MockFeatureBuilder.__new__(MockFeatureBuilder)
    builder.registry = registry
    builder.task = task
    builder.nodes = []
    return builder, view
| 196 | + |
| 197 | + |
# ---------------------------------------------------------------------------
# Bug fix: TransformationMode.PYTHON must set feature_cols=[]
#
# Previously only "ray" and "pandas" were handled. "python" (the default mode
# for @batch_feature_view) was missing, so get_column_info forwarded the BFV
# *output* feature names (e.g. user_avg_rating) to the offline store read
# step. Those columns do not exist in the raw source data, which resulted in
# UNRESOLVED_COLUMN errors at Spark analysis time.
# ---------------------------------------------------------------------------
| 207 | + |
| 208 | + |
@pytest.mark.parametrize(
    "mode",
    [
        TransformationMode.PYTHON,
        TransformationMode.PANDAS,
        TransformationMode.RAY,
        # Plain-string spellings of the same modes, for callers that store
        # the mode as a string rather than a TransformationMode member.
        "python",
        "pandas",
        "ray",
    ],
)
def test_get_column_info_clears_feature_cols_for_udf_modes(mode):
    """
    For transformation modes that compute output features from raw input
    (python, pandas, ray), get_column_info must set feature_cols=[] so the
    offline store read step issues SELECT * instead of projecting the output
    feature names that don't exist in the raw source schema.

    Join keys and the timestamp column must pass through unchanged.
    """
    builder, view = _make_builder_for_column_info(_make_transformation(mode))

    # Patch the name as looked up inside feature_builder so get_column_info
    # receives the canned tuple instead of inspecting the mocked view.
    with patch(
        "feast.infra.compute_engines.feature_builder._get_column_names",
        return_value=_MOCK_COLUMN_NAMES,
    ):
        col_info = builder.get_column_info(view)

    assert col_info.feature_cols == [], (
        f"Expected feature_cols=[] for TransformationMode {mode!r}, "
        f"got {col_info.feature_cols!r}. "
        "The offline store read step must not project output feature names "
        "that don't exist in the raw source schema."
    )
    assert col_info.join_keys == ["user_id"]
    assert col_info.ts_col == "event_timestamp"
| 244 | + |
| 245 | + |
@pytest.mark.parametrize(
    "mode",
    [
        TransformationMode.SPARK_SQL,
        TransformationMode.SQL,
        TransformationMode.SPARK,
        # Plain-string spellings, one per enum member above.
        "spark_sql",
        "sql",
        "spark",
    ],
)
def test_get_column_info_preserves_feature_cols_for_non_udf_modes(mode):
    """
    SQL, Spark-SQL and Spark transformations operate on already-projected
    columns and should NOT get feature_cols cleared — the source read must
    still select the named feature columns explicitly.

    Join keys and the timestamp column must pass through unchanged as well.
    """
    builder, view = _make_builder_for_column_info(_make_transformation(mode))

    # Patch the name as looked up inside feature_builder so get_column_info
    # receives the canned tuple instead of inspecting the mocked view.
    with patch(
        "feast.infra.compute_engines.feature_builder._get_column_names",
        return_value=_MOCK_COLUMN_NAMES,
    ):
        col_info = builder.get_column_info(view)

    assert col_info.feature_cols == ["user_avg_rating", "user_review_count"], (
        f"Expected feature_cols to be preserved for mode {mode!r}, "
        f"got {col_info.feature_cols!r}."
    )
    # Same pass-through checks as the UDF-mode test, for parity.
    assert col_info.join_keys == ["user_id"]
    assert col_info.ts_col == "event_timestamp"
| 274 | + |
| 275 | + |
def test_get_column_info_preserves_feature_cols_with_no_transformation():
    """
    A plain FeatureView (no transformation) must retain its feature column
    names so the offline store read step selects only the required columns.

    Join keys and the timestamp column must pass through unchanged as well.
    """
    # transformation=None models a view whose feature_transformation is unset.
    builder, view = _make_builder_for_column_info(None)

    # Patch the name as looked up inside feature_builder so get_column_info
    # receives the canned tuple instead of inspecting the mocked view.
    with patch(
        "feast.infra.compute_engines.feature_builder._get_column_names",
        return_value=_MOCK_COLUMN_NAMES,
    ):
        col_info = builder.get_column_info(view)

    assert col_info.feature_cols == ["user_avg_rating", "user_review_count"]
    # Same pass-through checks as the UDF-mode test, for parity.
    assert col_info.join_keys == ["user_id"]
    assert col_info.ts_col == "event_timestamp"
0 commit comments