@@ -34,8 +34,10 @@ use std::pin::Pin;
3434use std:: sync:: Arc ;
3535use std:: task:: { Context , Poll } ;
3636
37- use arrow:: array:: { Array , BinaryViewArray , BufferSpec , StringViewArray , layout} ;
38- use arrow:: datatypes:: { Schema , SchemaRef } ;
37+ use arrow:: array:: {
38+ Array , BinaryViewArray , BufferSpec , GenericByteViewArray , StringViewArray , layout,
39+ } ;
40+ use arrow:: datatypes:: { ByteViewType , Schema , SchemaRef } ;
3941use arrow:: ipc:: {
4042 MetadataVersion ,
4143 reader:: StreamReader ,
@@ -363,8 +365,26 @@ const VIEW_SIZE_BYTES: usize = 16;
363365///
364366/// # How it works
365367///
366- /// The GC process creates new compact buffers containing only the data referenced by the
367- /// current views, eliminating unreferenced data from sliced arrays.
368+ /// The GC process:
369+ /// 1. Identifies view arrays (StringView/BinaryView) in the batch
370+ /// 2. Checks if their data buffers exceed a memory threshold
371+ /// 3. If exceeded, calls the Arrow `gc()` method which creates new compact buffers
372+ /// containing only the data referenced by the current views
373+ /// 4. Returns a new batch with GC'd arrays (or original arrays if GC not needed)
374+ ///
375+ /// # When GC is triggered
376+ ///
377+ /// GC is only performed when data buffers exceed a threshold (currently 10KB).
378+ /// This balances memory savings against the CPU overhead of garbage collection.
379+ /// Small arrays are passed through unchanged since the GC overhead would exceed
380+ /// any memory savings.
381+ ///
382+ /// # Performance considerations
383+ ///
384+ /// The function always returns a new RecordBatch for API consistency, but:
385+ /// - If no view arrays are present, it's a cheap clone (just Arc increments)
386+ /// - GC is skipped for small buffers to avoid unnecessary CPU overhead
387+ /// - The Arrow `gc()` method itself is optimized and only copies referenced data
368388pub ( crate ) fn gc_view_arrays ( batch : & RecordBatch ) -> Result < RecordBatch > {
369389 // Early return optimization: Skip GC entirely if the batch contains no view arrays.
370390 // This avoids unnecessary processing for batches with only primitive types.
@@ -389,9 +409,9 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
389409 . as_any ( )
390410 . downcast_ref :: < StringViewArray > ( )
391411 . expect ( "Utf8View array should downcast to StringViewArray" ) ;
392- // Only perform GC if the data buffers exceed our size threshold .
393- // This balances memory savings against GC overhead .
394- if should_gc_view_array ( string_view. data_buffers ( ) ) {
412+ // Only perform GC if the array appears to be sliced (has potential waste) .
413+ // The gc() method internally checks if GC is beneficial .
414+ if should_gc_view_array ( string_view) {
395415 Arc :: new ( string_view. gc ( ) ) as Arc < dyn Array >
396416 } else {
397417 Arc :: clone ( array)
@@ -402,9 +422,9 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
402422 . as_any ( )
403423 . downcast_ref :: < BinaryViewArray > ( )
404424 . expect ( "BinaryView array should downcast to BinaryViewArray" ) ;
405- // Only perform GC if the data buffers exceed our size threshold .
406- // This balances memory savings against GC overhead .
407- if should_gc_view_array ( binary_view. data_buffers ( ) ) {
425+ // Only perform GC if the array appears to be sliced (has potential waste) .
426+ // The gc() method internally checks if GC is beneficial .
427+ if should_gc_view_array ( binary_view) {
408428 Arc :: new ( binary_view. gc ( ) ) as Arc < dyn Array >
409429 } else {
410430 Arc :: clone ( array)
@@ -420,24 +440,36 @@ pub(crate) fn gc_view_arrays(batch: &RecordBatch) -> Result<RecordBatch> {
420440 Ok ( RecordBatch :: try_new ( batch. schema ( ) , new_columns) ?)
421441}
422442
423- /// Determines whether a view array should be garbage collected based on its buffer usage .
443+ /// Determines whether a view array should be garbage collected.
424444///
425- /// Uses a minimum buffer size threshold to avoid unnecessary GC on small arrays.
426- /// This prevents the overhead of GC for arrays with negligible memory footprint,
427- /// while still capturing cases where sliced arrays carry large unreferenced buffers.
445+ /// This function checks if:
446+ /// 1. The array has data buffers (non-inline strings/binaries)
447+ /// 2. The total buffer memory exceeds a threshold
448+ /// 3. The array appears to be sliced (has potential memory waste)
428449///
429- /// # Why not use get_record_batch_memory_size
450+ /// The Arrow `gc()` method itself is a no-op for arrays that don't benefit from GC,
451+ /// but we still check here to avoid the overhead of calling gc() unnecessarily.
430452///
431- /// We use manual buffer size calculation here because:
432- /// - `get_record_batch_memory_size` operates on entire arrays, not just the data buffers
433- /// - We need to check buffer capacity specifically to determine GC potential
434- /// - The data buffers are what gets compacted during GC, so their size is the key metric
435- fn should_gc_view_array ( data_buffers : & [ arrow :: buffer :: Buffer ] ) -> bool {
436- // Only perform GC if the buffers exceed 10KB. This avoids the overhead of
437- // GC for small arrays while still capturing the pathological cases like
438- // the ClickBench scenario (820MB -> 33MB reduction).
453+ /// # Memory threshold rationale
454+ ///
455+ /// We use a 10KB threshold based on:
456+ /// - Small arrays (< 10KB) have negligible memory impact even if sliced
457+ /// - GC has CPU overhead that isn't worth it for small arrays
458+ /// - This threshold captures pathological cases (e.g., ClickBench: 820MB -> 33MB)
459+ /// while avoiding unnecessary GC on small working sets
460+ fn should_gc_view_array < T : ByteViewType > ( array : & GenericByteViewArray < T > ) -> bool {
439461 const MIN_BUFFER_SIZE_FOR_GC : usize = 10 * 1024 ; // 10KB threshold
462+
463+ let data_buffers = array. data_buffers ( ) ;
464+ if data_buffers. is_empty ( ) {
465+ // All strings/binaries are inlined (< 12 bytes), no GC needed
466+ return false ;
467+ }
468+
469+ // Calculate total buffer memory
440470 let total_buffer_size: usize = data_buffers. iter ( ) . map ( |b| b. capacity ( ) ) . sum ( ) ;
471+
472+ // Only GC if buffers exceed threshold
441473 total_buffer_size > MIN_BUFFER_SIZE_FOR_GC
442474}
443475
@@ -1156,7 +1188,7 @@ mod tests {
11561188 . as_any ( )
11571189 . downcast_ref :: < StringViewArray > ( )
11581190 . unwrap ( ) ;
1159- let should_gc = should_gc_view_array ( sliced_array. data_buffers ( ) ) ;
1191+ let should_gc = should_gc_view_array ( sliced_array) ;
11601192 let waste_ratio = calculate_string_view_waste_ratio ( sliced_array) ;
11611193
11621194 assert ! (
0 commit comments