Skip to content

Commit edc19f9

Browse files
committed
Use get_buffer_memory_size in should_gc_view_array
Address PR review: avoid duplicating data-buffer size calculation by deriving it from get_buffer_memory_size minus the views buffer.
1 parent 51c47bc commit edc19f9

1 file changed

Lines changed: 8 additions & 11 deletions

File tree

  • datafusion/physical-plan/src/spill

datafusion/physical-plan/src/spill/mod.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,6 @@ fn get_max_alignment_for_schema(schema: &Schema) -> usize {
348348

349349
/// Size of a single view structure in StringView/BinaryView arrays (in bytes).
350350
/// Each view is 16 bytes: 4 bytes length + 4 bytes prefix + 8 bytes buffer ID/offset.
351-
#[cfg(test)]
352351
const VIEW_SIZE_BYTES: usize = 16;
353352

354353
/// Performs garbage collection on StringView and BinaryView arrays before spilling to reduce memory usage.
@@ -443,22 +442,20 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
443442
/// Determines whether a view array should be garbage collected before spilling.
444443
///
445444
/// Arrow's `gc()` always allocates new compact buffers (it is never a no-op), so we
446-
/// check here to skip the allocation cost when data buffers are small.
447-
///
448-
/// We check `data_buffers` capacity rather than `array.get_buffer_memory_size()` because
449-
/// the latter includes the views buffer (16 bytes × n_rows), which scales with row count
450-
/// regardless of string content. We want to trigger GC only when non-inline string data
451-
/// is substantial enough to justify the copy.
445+
/// check here to skip the allocation cost when data buffers are small. We subtract
446+
/// the views buffer (16 bytes × n_rows) from `get_buffer_memory_size()` so the
447+
/// threshold tracks non-inline string data rather than row count.
452448
fn should_gc_view_array<T: ByteViewType>(array: &GenericByteViewArray<T>) -> bool {
453449
const MIN_BUFFER_SIZE_FOR_GC: usize = 10 * 1024; // 10KB threshold
454450

455-
let data_buffers = array.data_buffers();
456-
if data_buffers.is_empty() {
451+
if array.data_buffers().is_empty() {
457452
return false;
458453
}
459454

460-
let total_buffer_size: usize = data_buffers.iter().map(|b| b.capacity()).sum();
461-
total_buffer_size > MIN_BUFFER_SIZE_FOR_GC
455+
let data_buffer_size = array
456+
.get_buffer_memory_size()
457+
.saturating_sub(array.len() * VIEW_SIZE_BYTES);
458+
data_buffer_size > MIN_BUFFER_SIZE_FOR_GC
462459
}
463460

464461
#[cfg(test)]

0 commit comments

Comments
 (0)