@@ -348,7 +348,6 @@ fn get_max_alignment_for_schema(schema: &Schema) -> usize {
348348
349349/// Size of a single view structure in StringView/BinaryView arrays (in bytes).
350350/// Each view is 16 bytes: 4 bytes length + 4 bytes prefix + 8 bytes buffer ID/offset.
351- #[ cfg( test) ]
352351const VIEW_SIZE_BYTES : usize = 16 ;
353352
354353/// Performs garbage collection on StringView and BinaryView arrays before spilling to reduce memory usage.
@@ -443,22 +442,20 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
443442/// Determines whether a view array should be garbage collected before spilling.
444443///
445444/// Arrow's `gc()` always allocates new compact buffers (it is never a no-op), so we
446- /// check here to skip the allocation cost when data buffers are small.
447- ///
448- /// We check `data_buffers` capacity rather than `array.get_buffer_memory_size()` because
449- /// the latter includes the views buffer (16 bytes × n_rows), which scales with row count
450- /// regardless of string content. We want to trigger GC only when non-inline string data
451- /// is substantial enough to justify the copy.
445+ /// check here to skip the allocation cost when data buffers are small. We subtract
446+ /// the views buffer (16 bytes × n_rows) from `get_buffer_memory_size()` so the
447+ /// threshold tracks non-inline string data rather than row count.
452448fn should_gc_view_array < T : ByteViewType > ( array : & GenericByteViewArray < T > ) -> bool {
453449 const MIN_BUFFER_SIZE_FOR_GC : usize = 10 * 1024 ; // 10KB threshold
454450
455- let data_buffers = array. data_buffers ( ) ;
456- if data_buffers. is_empty ( ) {
451+ if array. data_buffers ( ) . is_empty ( ) {
457452 return false ;
458453 }
459454
460- let total_buffer_size: usize = data_buffers. iter ( ) . map ( |b| b. capacity ( ) ) . sum ( ) ;
461- total_buffer_size > MIN_BUFFER_SIZE_FOR_GC
455+ let data_buffer_size = array
456+ . get_buffer_memory_size ( )
457+ . saturating_sub ( array. len ( ) * VIEW_SIZE_BYTES ) ;
458+ data_buffer_size > MIN_BUFFER_SIZE_FOR_GC
462459}
463460
464461#[ cfg( test) ]
0 commit comments