@@ -35,6 +35,23 @@ use rand_distr::{Normal, Pareto};
3535use std:: fmt:: Write ;
3636use std:: sync:: Arc ;
3737
38+ /// Payload profile for the benchmark `utf8` column.
39+ ///
40+ /// The small profile preserves the existing `hi0`..`hi3` baseline. Medium and
41+ /// large profiles keep the same low cardinality but scale each value's byte
42+ /// width so string aggregation can expose the cost of copying larger payloads.
43+ #[ derive( Clone , Copy , Debug ) ]
44+ pub enum Utf8PayloadProfile {
45+ /// 3-byte baseline values such as `hi0`.
46+ Small ,
47+ /// 64-byte payloads that are large enough to make copying noticeable
48+ /// without dominating the benchmark with allocator churn.
49+ Medium ,
50+ /// 1024-byte payloads that amplify both CPU and memory pressure in
51+ /// grouped `string_agg` workloads.
52+ Large ,
53+ }
54+
3855/// create an in-memory table given the partition len, array len, and batch size,
3956/// and the result table will be of array_len in total, and then partitioned, and batched.
4057#[ expect( clippy:: allow_attributes) ] // some issue where expect(dead_code) doesn't fire properly
@@ -44,9 +61,32 @@ pub fn create_table_provider(
4461 array_len : usize ,
4562 batch_size : usize ,
4663) -> Result < Arc < MemTable > > {
64+ create_table_provider_with_payload (
65+ partitions_len,
66+ array_len,
67+ batch_size,
68+ Utf8PayloadProfile :: Small ,
69+ )
70+ }
71+
72+ /// Create an in-memory table with a configurable `utf8` payload size.
73+ #[ expect( clippy:: allow_attributes) ] // some issue where expect(dead_code) doesn't fire properly
74+ #[ allow( dead_code) ]
75+ pub fn create_table_provider_with_payload (
76+ partitions_len : usize ,
77+ array_len : usize ,
78+ batch_size : usize ,
79+ utf8_payload_profile : Utf8PayloadProfile ,
80+ ) -> Result < Arc < MemTable > > {
81+ let _ = Utf8PayloadProfile :: all ( ) ;
4782 let schema = Arc :: new ( create_schema ( ) ) ;
48- let partitions =
49- create_record_batches ( & schema, array_len, partitions_len, batch_size) ;
83+ let partitions = create_record_batches (
84+ & schema,
85+ array_len,
86+ partitions_len,
87+ batch_size,
88+ utf8_payload_profile,
89+ ) ;
5090 // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
5191 MemTable :: try_new ( schema, partitions) . map ( Arc :: new)
5292}
@@ -91,12 +131,14 @@ fn create_record_batch(
91131 rng : & mut StdRng ,
92132 batch_size : usize ,
93133 batch_index : usize ,
134+ payloads : & [ String ; 4 ] ,
94135) -> RecordBatch {
95136 // Randomly choose from 4 distinct key values; a higher number increases sparseness.
96137 let key_suffixes = [ 0 , 1 , 2 , 3 ] ;
97- let keys = StringArray :: from_iter_values (
98- ( 0 ..batch_size) . map ( |_| format ! ( "hi{}" , key_suffixes. choose( rng) . unwrap( ) ) ) ,
99- ) ;
138+ let keys = StringArray :: from_iter_values ( ( 0 ..batch_size) . map ( |_| {
139+ let suffix = * key_suffixes. choose ( rng) . unwrap ( ) ;
140+ payloads[ suffix] . as_str ( )
141+ } ) ) ;
100142
101143 let values = create_data ( rng, batch_size, 0.5 ) ;
102144
@@ -146,10 +188,12 @@ pub fn create_record_batches(
146188 array_len : usize ,
147189 partitions_len : usize ,
148190 batch_size : usize ,
191+ utf8_payload_profile : Utf8PayloadProfile ,
149192) -> Vec < Vec < RecordBatch > > {
150193 let mut rng = StdRng :: seed_from_u64 ( 42 ) ;
151194 let mut partitions = Vec :: with_capacity ( partitions_len) ;
152195 let batches_per_partition = array_len / batch_size / partitions_len;
196+ let payloads = utf8_payload_profile. payloads ( ) ;
153197
154198 for _ in 0 ..partitions_len {
155199 let mut batches = Vec :: with_capacity ( batches_per_partition) ;
@@ -159,13 +203,37 @@ pub fn create_record_batches(
159203 & mut rng,
160204 batch_size,
161205 batch_index,
206+ & payloads,
162207 ) ) ;
163208 }
164209 partitions. push ( batches) ;
165210 }
166211 partitions
167212}
168213
214+ impl Utf8PayloadProfile {
215+ fn all ( ) -> [ Self ; 3 ] {
216+ [ Self :: Small , Self :: Medium , Self :: Large ]
217+ }
218+
219+ fn payloads ( self ) -> [ String ; 4 ] {
220+ std:: array:: from_fn ( |idx| match self {
221+ Self :: Small => format ! ( "hi{idx}" ) ,
222+ Self :: Medium => payload_string ( "mid" , idx, 64 ) ,
223+ Self :: Large => payload_string ( "large" , idx, 1024 ) ,
224+ } )
225+ }
226+ }
227+
228+ fn payload_string ( prefix : & str , suffix : usize , target_len : usize ) -> String {
229+ let mut value = format ! ( "{prefix}{suffix}_" ) ;
230+ value. extend ( std:: iter:: repeat_n (
231+ ( b'a' + suffix as u8 ) as char ,
232+ target_len - value. len ( ) ,
233+ ) ) ;
234+ value
235+ }
236+
169237/// An enum that wraps either a regular StringBuilder or a GenericByteViewBuilder
170238/// so that both can be used interchangeably.
171239enum TraceIdBuilder {
0 commit comments