Commit fa5e9d0

adriangb and claude committed
refactor: address remaining PR #22000 review feedback
Follow-ups to the cherry-picked refactor that landed the file_index keying:

* Reject `TABLESAMPLE` without an explicit method instead of silently treating it as `SYSTEM` (#22000 (comment)). PostgreSQL requires an explicit method and Spark defaults to block-level; picking one here in core would commit to semantics callers may not want. Added an slt case to lock in the new error.
* Rephrase the `SamplePushdown` planning error from "TABLESAMPLE is not supported for this source" to "TABLESAMPLE could not be pushed down", since the failure may originate at any node along the passthrough chain, not just the leaf source (#22000 (comment)). Updated the slt assertion to match.
* Dedupe the SYSTEM-mode adaptive split comments in the parquet opener; the outer block now covers determinism and the inner block covers the row-group-vs-row split math without overlap (#22000 (comment)).
* Update the `select.md` and `relation_planner/table_sample.rs` REPEATABLE wording to reflect that sampling now keys on the execution `partition_index`, not the on-disk file path (#22000 (comment) and #discussion_r3187445171).
* Replace the opener-level "REPEATABLE ignores file name" test with a "sampling keys on partition_index" test that verifies same partition_index → same selection regardless of file name, and different partition_index → uncorrelated samples. Added `with_partition_index` to the test builder.
* Refresh the `run_examples-7` snapshot to match the new seed mix (the per-row-group hash now folds in the optional REPEATABLE seed alongside `file_index`; deterministic, but a different draw).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ec48974 commit fa5e9d0

6 files changed

Lines changed: 64 additions & 46 deletions
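The "new seed mix" the last bullet describes — a per-row-group hash that folds the optional REPEATABLE seed in alongside a stable per-file index — can be sketched as a standalone illustration. This is a hypothetical model using std's `DefaultHasher`, not DataFusion's actual sampler or hash function; `keep_row_group` and its parameters are invented names for the idea only.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Hypothetical sketch: decide whether a row group survives SYSTEM
/// sampling by hashing the optional REPEATABLE seed together with the
/// execution partition_index and the row-group index -- never the
/// on-disk path -- so the draw is deterministic across machines.
fn keep_row_group(
    seed: Option<u64>,
    partition_index: usize,
    row_group_index: usize,
    fraction: f64,
) -> bool {
    let mut hasher = DefaultHasher::new();
    seed.hash(&mut hasher);
    partition_index.hash(&mut hasher);
    row_group_index.hash(&mut hasher);
    // Map the 64-bit hash onto [0, 1) and compare to the sample fraction.
    let draw = (hasher.finish() >> 11) as f64 / (1u64 << 53) as f64;
    draw < fraction
}

fn main() {
    // Same seed + partition_index => same selection, whatever the file name.
    let p0a: Vec<bool> = (0..8).map(|rg| keep_row_group(Some(42), 0, rg, 0.5)).collect();
    let p0b: Vec<bool> = (0..8).map(|rg| keep_row_group(Some(42), 0, rg, 0.5)).collect();
    assert_eq!(p0a, p0b);
    // A different partition_index feeds different bytes to the hash,
    // decorrelating the draws across files in a parallel scan.
    let p1: Vec<bool> = (0..8).map(|rg| keep_row_group(Some(42), 1, rg, 0.5)).collect();
    println!("p0 = {p0a:?}\np1 = {p1:?}");
}
```

The key design point mirrored here is that every input to the hash is stable across environments, which is why changing a file's object-store path cannot change the sample.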

datafusion-examples/examples/relation_planner/table_sample.rs

Lines changed: 5 additions & 3 deletions
@@ -306,8 +306,10 @@ async fn run_examples(ctx: &SessionContext) -> Result<()> {
     // through. Routed against the parquet-backed copy of the table so
     // the `SamplePushdown` rule can absorb the sample into the scan.
     // `REPEATABLE(42)` makes the rows deterministic across runs and
-    // across machines (the seed dominates the file path in the
-    // sampler's hash input).
+    // across machines: the parquet sampler keys on the seed plus the
+    // execution `partition_index` (a stable per-file id), never on the
+    // on-disk path, so the same query against the same data picks the
+    // same rows everywhere.
     let results = run_example(
         ctx,
         "Example 7: SYSTEM (handled by the built-in, not this example)",
@@ -318,11 +320,11 @@ async fn run_examples(ctx: &SessionContext) -> Result<()> {
     +---------+---------+
     | column1 | column2 |
     +---------+---------+
-    | 5       | row_5   |
     | 6       | row_6   |
     | 7       | row_7   |
     | 8       | row_8   |
     | 9       | row_9   |
+    | 10      | row_10  |
     +---------+---------+
     ");

datafusion/datasource-parquet/src/opener.rs

Lines changed: 34 additions & 35 deletions
@@ -1777,6 +1777,13 @@ mod test {
        self
    }

+    /// Set the partition_index plumbed through to the opener.
+    /// Sampling keys on this so different partitions decorrelate.
+    fn with_partition_index(mut self, partition_index: usize) -> Self {
+        self.partition_index = partition_index;
+        self
+    }
+
    /// Set the object store (required for building).
    fn with_store(mut self, store: Arc<dyn ObjectStore>) -> Self {
        self.store = Some(store);
@@ -3009,11 +3016,15 @@ mod test {
        );
    }

-    /// REPEATABLE(seed) must produce the same selection regardless of
-    /// where the parquet file lives. This is the SQL semantics users
-    /// expect from `TABLESAMPLE ... REPEATABLE(n)`.
+    /// Sampling must key on the execution `partition_index`, not the
+    /// on-disk file path: two parquet files with different names but
+    /// identical content opened at the same partition_index pick the
+    /// same rows. Different partition_index values must decorrelate.
+    /// This is what makes `TABLESAMPLE ... REPEATABLE(n)` reproducible
+    /// across environments without leaking object-store paths into the
+    /// sample seed.
    #[tokio::test]
-    async fn system_target_remaining_repeatable_seed_ignores_file_name() {
+    async fn system_target_remaining_sample_keys_on_partition_index() {
        let store = Arc::new(InMemory::new()) as Arc<dyn ObjectStore>;

        let values: Vec<Option<i32>> = (0..100).map(Some).collect();
@@ -3033,15 +3044,18 @@ mod test {
            ..Default::default()
        };

-        let opener = ParquetMorselizerBuilder::new()
+        // Same partition_index for both files -> same selection,
+        // regardless of file name.
+        let opener_p0 = ParquetMorselizerBuilder::new()
            .with_store(Arc::clone(&store))
            .with_schema(Arc::clone(&schema))
            .with_projection_indices(&[0])
-            .with_sampling(sampling)
+            .with_partition_index(0)
+            .with_sampling(sampling.clone())
            .build();

        let stream_a = open_file(
-            &opener,
+            &opener_p0,
            PartitionedFile::new(
                "first.parquet".to_string(),
                u64::try_from(len_a).unwrap(),
@@ -3050,7 +3064,7 @@ mod test {
        .await
        .unwrap();
        let stream_b = open_file(
-            &opener,
+            &opener_p0,
            PartitionedFile::new(
                "second_path.parquet".to_string(),
                u64::try_from(len_b).unwrap(),
@@ -3064,52 +3078,37 @@ mod test {

        assert_eq!(
            rows_a, rows_b,
-            "REPEATABLE(seed) must select the same rows regardless of file path"
+            "sampling must key on partition_index, not file name"
        );
        assert!(
            !rows_a.is_empty() && rows_a.len() < 100,
            "expected a strict subset; got {} rows",
            rows_a.len()
        );

-        // Without a seed the selection must depend on the file name —
-        // otherwise unrelated parquet files in the same scan would all
-        // produce correlated samples, defeating the purpose of file-
-        // axis randomisation.
-        let unseeded_sampling = crate::sampling::ParquetSampling {
-            system_target_remaining: Some(0.5),
-            row_cluster_size: 4,
-            ..Default::default()
-        };
-        let unseeded_opener = ParquetMorselizerBuilder::new()
+        // Different partition_index -> uncorrelated samples even with
+        // the same seed. Otherwise every file in a parallel scan would
+        // pick the same row indices, defeating file-axis randomisation.
+        let opener_p1 = ParquetMorselizerBuilder::new()
            .with_store(Arc::clone(&store))
            .with_schema(Arc::clone(&schema))
            .with_projection_indices(&[0])
-            .with_sampling(unseeded_sampling)
+            .with_partition_index(1)
+            .with_sampling(sampling)
            .build();
-        let stream_a2 = open_file(
-            &unseeded_opener,
+        let stream_a_p1 = open_file(
+            &opener_p1,
            PartitionedFile::new(
                "first.parquet".to_string(),
                u64::try_from(len_a).unwrap(),
            ),
        )
        .await
        .unwrap();
-        let stream_b2 = open_file(
-            &unseeded_opener,
-            PartitionedFile::new(
-                "second_path.parquet".to_string(),
-                u64::try_from(len_b).unwrap(),
-            ),
-        )
-        .await
-        .unwrap();
-        let rows_a2 = collect_values(stream_a2).await;
-        let rows_b2 = collect_values(stream_b2).await;
+        let rows_a_p1 = collect_values(stream_a_p1).await;
        assert_ne!(
-            rows_a2, rows_b2,
-            "without a seed, different file names should produce different samples"
+            rows_a, rows_a_p1,
+            "different partition_index must produce different samples"
        );
    }

datafusion/physical-optimizer/src/sample_pushdown.rs

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ impl PhysicalOptimizerRule for SamplePushdown {
            Pushdown::Pushed(new_child) => Ok(Transformed::yes(new_child)),
            Pushdown::Failed(reason) => {
                datafusion_common::plan_err!(
-                    "TABLESAMPLE is not supported for this source: {reason}. \
+                    "TABLESAMPLE could not be pushed down: {reason}. \
                    A generic post-scan SampleExec is not yet implemented; \
                    see https://github.com/apache/datafusion/issues/16533"
                )

datafusion/sql/src/sample.rs

Lines changed: 15 additions & 3 deletions
@@ -99,9 +99,21 @@ impl RelationPlanner for TableSampleSystemPlanner {
        }
        match ts.name {
            // The built-in planner only handles SYSTEM (and BLOCK as an
-            // alias for SYSTEM, matching Hive). Anything else is a
-            // semantics commitment we don't want to make in core.
-            Some(TableSampleMethod::System) | Some(TableSampleMethod::Block) | None => {}
+            // alias for SYSTEM, matching Hive). An unspecified method
+            // is rejected rather than silently picking SYSTEM, since
+            // the right default differs by engine (PostgreSQL requires
+            // an explicit method; Spark defaults to block-level).
+            // Anything else is a semantics commitment we don't want to
+            // make in core.
+            Some(TableSampleMethod::System) | Some(TableSampleMethod::Block) => {}
+            None => {
+                return not_impl_err!(
+                    "TABLESAMPLE without an explicit method is not supported; \
+                    write TABLESAMPLE SYSTEM (...) (or register a custom \
+                    RelationPlanner before the built-in TableSampleSystemPlanner \
+                    to define a default)."
+                );
+            }
            Some(other) => {
                return not_impl_err!(
                    "TABLESAMPLE method {other} is not supported (only SYSTEM). \
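The validation control flow this hunk lands can be seen in a minimal, self-contained sketch. The enum, the plain `String` errors, and `validate_method` are hypothetical stand-ins for the real planner types and `not_impl_err!`; only the accept/reject decisions mirror the diff.

```rust
// Hypothetical stand-in for sqlparser's method enum; only the decision
// logic mirrors the diff: SYSTEM and BLOCK are accepted, a missing
// method is an error rather than a silent SYSTEM default, and any
// other method is rejected.
#[derive(Debug)]
enum TableSampleMethod {
    System,
    Block,
    Bernoulli,
}

fn validate_method(method: Option<TableSampleMethod>) -> Result<(), String> {
    match method {
        Some(TableSampleMethod::System) | Some(TableSampleMethod::Block) => Ok(()),
        None => Err(
            "TABLESAMPLE without an explicit method is not supported; \
             write TABLESAMPLE SYSTEM (...)"
                .to_string(),
        ),
        Some(other) => Err(format!(
            "TABLESAMPLE method {other:?} is not supported (only SYSTEM)"
        )),
    }
}

fn main() {
    assert!(validate_method(Some(TableSampleMethod::System)).is_ok());
    assert!(validate_method(Some(TableSampleMethod::Block)).is_ok());
    assert!(validate_method(None).is_err());
    assert!(validate_method(Some(TableSampleMethod::Bernoulli)).is_err());
    println!("method validation matches the behaviour the diff describes");
}
```

Putting `None` in its own arm (instead of folding it into the accepted pattern) is what keeps the default-method question out of core: a caller who wants a default can intercept the query before this planner runs.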

datafusion/sqllogictest/test_files/tablesample.slt

Lines changed: 4 additions & 1 deletion
@@ -78,6 +78,9 @@ physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/
 statement error TABLESAMPLE method BERNOULLI is not supported
 SELECT count(*) FROM sample_table TABLESAMPLE BERNOULLI (10);

+statement error TABLESAMPLE without an explicit method is not supported
+SELECT count(*) FROM sample_table TABLESAMPLE (10);
+
 statement error TABLESAMPLE with ROWS count is not supported
 SELECT count(*) FROM sample_table TABLESAMPLE SYSTEM (100 ROWS);

@@ -102,7 +105,7 @@ STORED AS CSV
 LOCATION 'test_files/scratch/tablesample/sample_table.csv'
 OPTIONS ('format.has_header' 'true');

-statement error TABLESAMPLE is not supported for this source
+statement error TABLESAMPLE could not be pushed down
 SELECT count(*) FROM sample_csv TABLESAMPLE SYSTEM (50);

 statement ok

docs/source/user-guide/sql/select.md

Lines changed: 5 additions & 3 deletions
@@ -156,9 +156,11 @@ pure row-level sampling.

 `REPEATABLE(seed)` mixes the seed into every random draw, so all
 levels produce the same selection across runs. The selection also
-depends on the file name, the row-group index within the file, and
-the cluster size, so different files don't accidentally see
-correlated samples.
+depends on the execution `partition_index` of each file (a stable
+per-file id assigned by the scan, independent of the on-disk path),
+the row-group index within the file, and the cluster size, so
+different files don't accidentally see correlated samples and the
+sample is reproducible across environments.

 The sampling is visible in `EXPLAIN`: