@@ -2652,11 +2652,20 @@ mod tests {
26522652 Ok ( ( ) )
26532653 }
26542654
2655+ /// Ensure that memory tracker usage does not blow up after hash repartition of string view
2656+ /// arrays.
2657+ /// See: https://github.com/apache/datafusion/issues/20491
26552658 #[ tokio:: test]
26562659 async fn hash_repartition_string_view_compaction ( ) -> Result < ( ) > {
26572660 let schema = test_schema_string_view ( ) ;
26582661 let num_partitions = 8 ;
2659- let batch = create_string_view_batch ( 800 , num_partitions) ;
2662+
2663+ // use batch_size rows per partition to avoid the coalescer from GCing during coalesce.
2664+ let batch_size = SessionConfig :: new ( ) . batch_size ( ) ;
2665+
2666+ let expected_rows = batch_size * num_partitions;
2667+ let batch = create_string_view_batch ( expected_rows, num_partitions) ;
2668+ let original_size = batch. get_array_memory_size ( ) ;
26602669 let partitions = vec ! [ vec![ batch] ] ;
26612670
26622671 let output_partitions = repartition (
@@ -2670,9 +2679,19 @@ mod tests {
26702679
26712680 let total_rows: usize = output_partitions
26722681 . iter ( )
2673- . map ( |x| x. iter ( ) . map ( |x| x. num_rows ( ) ) . sum :: < usize > ( ) )
2682+ . flatten ( )
2683+ . map ( |batch| batch. num_rows ( ) )
2684+ . sum ( ) ;
2685+ assert_eq ! ( total_rows, expected_rows) ;
2686+
2687+ let repartitioned_size: usize = output_partitions
2688+ . iter ( )
2689+ . flatten ( )
2690+ . map ( |batch| batch. get_array_memory_size ( ) )
26742691 . sum ( ) ;
2675- assert_eq ! ( total_rows, 800 ) ;
2692+ // without GC, the repartitioned_size blows up to 4x (in this case) of the original_size.
2693+ // So using a safe threshold of 2x.
2694+ assert ! ( repartitioned_size <= original_size * 2 ) ;
26762695
26772696 Ok ( ( ) )
26782697 }
0 commit comments