Skip to content

Commit f49d7b3

Browse files
EeshanBembi and adriangb
authored and committed
Remove redundant StringView GC in external sort
The SpillManager now handles GC for StringView/BinaryView arrays internally via gc_view_arrays(), making the organize_stringview_arrays() function in external sort redundant. Changes: (1) remove the organize_stringview_arrays() call and function from sort.rs; (2) use batch.clone() for the early return (cheaper than creating a new batch); (3) use the arrow_data::MAX_INLINE_VIEW_LEN constant instead of a custom constant; (4) update the comment in spill_manager.rs to reference gc_view_arrays().
1 parent 9cb3aef commit f49d7b3

6 files changed

Lines changed: 9 additions & 81 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ arrow-avro = { version = "58.1.0", default-features = false, features = [
100100
"xz",
101101
] }
102102
arrow-buffer = { version = "58.1.0", default-features = false }
103+
arrow-data = { version = "58.1.0", default-features = false }
103104
arrow-flight = { version = "58.1.0", features = [
104105
"flight-sql-experimental",
105106
] }

datafusion/physical-plan/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ pin-project-lite = "^0.2.7"
7272
tokio = { workspace = true }
7373

7474
[dev-dependencies]
75+
arrow-data = { workspace = true }
7576
criterion = { workspace = true, features = ["async_futures"] }
7677
datafusion-functions-aggregate = { workspace = true }
7778
datafusion-functions-window = { workspace = true }

datafusion/physical-plan/src/sorts/sort.rs

Lines changed: 1 addition & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ use crate::{
5353
Statistics,
5454
};
5555

56-
use arrow::array::{Array, RecordBatch, RecordBatchOptions, StringViewArray};
56+
use arrow::array::{Array, RecordBatch, RecordBatchOptions};
5757
use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays};
5858
use arrow::datatypes::SchemaRef;
5959
use datafusion_common::config::SpillCompression;
@@ -418,8 +418,6 @@ impl ExternalSorter {
418418
Some((self.spill_manager.create_in_progress_file("Sorting")?, 0));
419419
}
420420

421-
Self::organize_stringview_arrays(globally_sorted_batches)?;
422-
423421
debug!("Spilling sort data of ExternalSorter to disk whilst inserting");
424422

425423
let batches_to_spill = std::mem::take(globally_sorted_batches);
@@ -463,71 +461,6 @@ impl ExternalSorter {
463461
Ok(())
464462
}
465463

466-
/// Reconstruct `globally_sorted_batches` to organize the payload buffers of each
467-
/// `StringViewArray` in sequential order by calling `gc()` on them.
468-
///
469-
/// Note this is a workaround until <https://github.com/apache/arrow-rs/issues/7185> is
470-
/// available
471-
///
472-
/// # Rationale
473-
/// After (merge-based) sorting, all batches will be sorted into a single run,
474-
/// but physically this sorted run is chunked into many small batches. For
475-
/// `StringViewArray`s inside each sorted run, their inner buffers are not
476-
/// re-constructed by default, leading to non-sequential payload locations
477-
/// (permutated by `interleave()` Arrow kernel). A single payload buffer might
478-
/// be shared by multiple `RecordBatch`es.
479-
/// When writing each batch to disk, the writer has to write all referenced buffers,
480-
/// because they have to be read back one by one to reduce memory usage. This
481-
/// causes extra disk reads and writes, and potentially execution failure.
482-
///
483-
/// # Example
484-
/// Before sorting:
485-
/// batch1 -> buffer1
486-
/// batch2 -> buffer2
487-
///
488-
/// sorted_batch1 -> buffer1
489-
/// -> buffer2
490-
/// sorted_batch2 -> buffer1
491-
/// -> buffer2
492-
///
493-
/// Then when spilling each batch, the writer has to write all referenced buffers
494-
/// repeatedly.
495-
fn organize_stringview_arrays(
496-
globally_sorted_batches: &mut Vec<RecordBatch>,
497-
) -> Result<()> {
498-
let mut organized_batches = Vec::with_capacity(globally_sorted_batches.len());
499-
500-
for batch in globally_sorted_batches.drain(..) {
501-
let mut new_columns: Vec<Arc<dyn Array>> =
502-
Vec::with_capacity(batch.num_columns());
503-
504-
let mut arr_mutated = false;
505-
for array in batch.columns() {
506-
if let Some(string_view_array) =
507-
array.as_any().downcast_ref::<StringViewArray>()
508-
{
509-
let new_array = string_view_array.gc();
510-
new_columns.push(Arc::new(new_array));
511-
arr_mutated = true;
512-
} else {
513-
new_columns.push(Arc::clone(array));
514-
}
515-
}
516-
517-
let organized_batch = if arr_mutated {
518-
RecordBatch::try_new(batch.schema(), new_columns)?
519-
} else {
520-
batch
521-
};
522-
523-
organized_batches.push(organized_batch);
524-
}
525-
526-
*globally_sorted_batches = organized_batches;
527-
528-
Ok(())
529-
}
530-
531464
/// Sorts the in-memory batches and merges them into a single sorted run, then writes
532465
/// the result to spill files.
533466
async fn sort_and_spill_in_mem_batches(&mut self) -> Result<()> {

datafusion/physical-plan/src/spill/mod.rs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -349,12 +349,6 @@ fn get_max_alignment_for_schema(schema: &Schema) -> usize {
349349
#[cfg(test)]
350350
const VIEW_SIZE_BYTES: usize = 16;
351351

352-
/// Maximum size of inlined string/binary data in StringView/BinaryView arrays.
353-
/// Strings/binaries <= 12 bytes are stored inline within the 16-byte view structure.
354-
/// This matches the Arrow specification for view arrays.
355-
#[cfg(test)]
356-
const INLINE_THRESHOLD: usize = 12;
357-
358352
/// Performs garbage collection on StringView and BinaryView arrays before spilling to reduce memory usage.
359353
///
360354
/// # Why GC is needed
@@ -382,11 +376,8 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
382376
});
383377

384378
if !has_view_arrays {
385-
// Return a new batch to maintain consistent behavior
386-
return Ok(RecordBatch::try_new(
387-
batch.schema(),
388-
batch.columns().to_vec(),
389-
)?);
379+
// RecordBatch::clone() is cheap - just Arc reference count bumps
380+
return Ok(batch.clone());
390381
}
391382

392383
let mut new_columns: Vec<Arc<dyn Array>> = Vec::with_capacity(batch.num_columns());
@@ -452,10 +443,11 @@ fn should_gc_view_array(data_buffers: &[arrow::buffer::Buffer]) -> bool {
452443

453444
#[cfg(test)]
454445
fn calculate_string_view_waste_ratio(array: &StringViewArray) -> f64 {
446+
use arrow_data::MAX_INLINE_VIEW_LEN;
455447
calculate_view_waste_ratio(array.len(), array.data_buffers(), |i| {
456448
if !array.is_null(i) {
457449
let value = array.value(i);
458-
if value.len() > INLINE_THRESHOLD {
450+
if value.len() > MAX_INLINE_VIEW_LEN as usize {
459451
return value.len();
460452
}
461453
}

datafusion/physical-plan/src/spill/spill_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ impl SpillManager {
197197
pub(crate) trait GetSlicedSize {
198198
/// Returns the size of the `RecordBatch` when sliced.
199199
/// Note: if multiple arrays or even a single array share the same data buffers, we may double count each buffer.
200-
/// Therefore, make sure we call gc() or organize_stringview_arrays() before using this method.
200+
/// Therefore, make sure we call gc() or gc_view_arrays() before using this method.
201201
fn get_sliced_size(&self) -> Result<usize>;
202202
}
203203

0 commit comments

Comments (0)