Skip to content

Commit 23633d4

Browse files
EeshanBembiadriangb
authored andcommitted
Fix massive spill files for StringView/BinaryView columns
Add garbage collection for StringView and BinaryView arrays before spilling to disk. This prevents sliced arrays from carrying their entire original buffers when written to spill files. Changes: - Add gc_view_arrays() function to apply GC on view arrays - Integrate GC into InProgressSpillFile::append_batch() - Use simple threshold-based heuristic (100+ rows, 10KB+ buffer size) Fixes #19414 where GROUP BY on StringView columns created 820MB spill files instead of 33MB due to sliced arrays maintaining references to original buffers. Testing shows 80-98% reduction in spill file sizes for typical GROUP BY workloads.
1 parent 2818abb commit 23633d4

2 files changed

Lines changed: 520 additions & 3 deletions

File tree

datafusion/physical-plan/src/spill/in_progress_spill_file.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::array::RecordBatch;
2424
use datafusion_common::exec_datafusion_err;
2525
use datafusion_execution::disk_manager::RefCountedTempFile;
2626

27-
use super::{IPCStreamWriter, spill_manager::SpillManager};
27+
use super::{IPCStreamWriter, gc_view_arrays, spill_manager::SpillManager};
2828

2929
/// Represents an in-progress spill file used for writing `RecordBatch`es to disk, created by `SpillManager`.
3030
/// Caller is able to use this struct to incrementally append in-memory batches to
@@ -50,6 +50,7 @@ impl InProgressSpillFile {
5050
}
5151

5252
/// Appends a `RecordBatch` to the spill file, initializing the writer if necessary.
53+
/// Performs garbage collection on StringView/BinaryView arrays to reduce spill file size.
5354
///
5455
/// # Errors
5556
/// - Returns an error if the file is not active (has been finalized)
@@ -61,6 +62,9 @@ impl InProgressSpillFile {
6162
"Append operation failed: No active in-progress file. The file may have already been finalized."
6263
));
6364
}
65+
66+
let gc_batch = gc_view_arrays(batch)?;
67+
6468
if self.writer.is_none() {
6569
// Use the SpillManager's declared schema rather than the batch's schema.
6670
// Individual batches may have different schemas (e.g., different nullability)
@@ -87,7 +91,7 @@ impl InProgressSpillFile {
8791
}
8892
}
8993
if let Some(writer) = &mut self.writer {
90-
let (spilled_rows, _) = writer.write(batch)?;
94+
let (spilled_rows, _) = writer.write(&gc_batch)?;
9195
if let Some(in_progress_file) = &mut self.in_progress_file {
9296
let pre_size = in_progress_file.current_disk_usage();
9397
in_progress_file.update_disk_usage()?;

0 commit comments

Comments
 (0)