@@ -54,6 +54,10 @@ const NULL_PERCENTS: [f64; 2] = [0., 0.2];
5454const STRING_LENGTHS : [ usize ; 3 ] = [ 3 , 12 , 100 ] ;
5555const ARRAY_LENGTH : usize = 8192 ;
5656
/// Candidate string lengths for the mixed-length ("realistic") benchmarks.
///
/// Half of the entries are at most 12 and half are greater than 12, so picking
/// an entry uniformly at random yields roughly equal numbers of short and long
/// strings.
///
/// NOTE(review): the 12 cut-off presumably mirrors the inline-storage limit of
/// Arrow string views; confirm before changing these values.
const MIXED_STRING_LENGTHS: &[usize] = &[
    // Short: length <= 12.
    3, 6, 9, 12,
    // Long: length > 12.
    16, 20, 25, 30,
];
60+
5761/// Returns a friendly type name for the array type.
5862fn array_type_name < A : ' static > ( ) -> & ' static str {
5963 let id = TypeId :: of :: < A > ( ) ;
@@ -150,6 +154,71 @@ fn bench_numeric_type<T, A>(
150154 }
151155}
152156
157+ /// Generates a random string with a length chosen from MIXED_STRING_LENGTHS.
158+ fn random_mixed_length_string ( rng : & mut StdRng ) -> String {
159+ let len = * MIXED_STRING_LENGTHS . choose ( rng) . unwrap ( ) ;
160+ random_string ( rng, len)
161+ }
162+
163+ /// Benchmarks realistic mixed-length IN list scenario.
164+ ///
165+ /// Tests with:
166+ /// - Mixed short (≤12 bytes) and long (>12 bytes) strings in the IN list
167+ /// - Varying prefixes (fully random strings)
168+ /// - Configurable match rate (% of values that are in the IN list)
169+ /// - Various IN list sizes (3, 8, 28, 100)
170+ fn bench_realistic_mixed_strings < A > (
171+ c : & mut Criterion ,
172+ rng : & mut StdRng ,
173+ make_scalar : fn ( String ) -> ScalarValue ,
174+ ) where
175+ A : Array + FromIterator < Option < String > > + ' static ,
176+ {
177+ for in_list_length in IN_LIST_LENGTHS {
178+ for match_percent in [ 0.0 , 0.25 , 0.75 ] {
179+ for null_percent in NULL_PERCENTS {
180+ // Generate IN list with mixed-length random strings
181+ let in_list_strings: Vec < String > = ( 0 ..in_list_length)
182+ . map ( |_| random_mixed_length_string ( rng) )
183+ . collect ( ) ;
184+
185+ let in_list: Vec < _ > = in_list_strings
186+ . iter ( )
187+ . map ( |s| make_scalar ( s. clone ( ) ) )
188+ . collect ( ) ;
189+
190+ // Generate values array with controlled match rate
191+ let values: A = ( 0 ..ARRAY_LENGTH )
192+ . map ( |_| {
193+ if !rng. random_bool ( 1.0 - null_percent) {
194+ None
195+ } else if rng. random_bool ( match_percent) {
196+ // Pick from IN list (will match)
197+ Some ( in_list_strings. choose ( rng) . unwrap ( ) . clone ( ) )
198+ } else {
199+ // Generate new random string (unlikely to match)
200+ Some ( random_mixed_length_string ( rng) )
201+ }
202+ } )
203+ . collect ( ) ;
204+
205+ do_bench (
206+ c,
207+ & format ! (
208+ "in_list/{}/mixed/list={}/match={}%/nulls={}%" ,
209+ array_type_name:: <A >( ) ,
210+ in_list_length,
211+ ( match_percent * 100.0 ) as u32 ,
212+ ( null_percent * 100.0 ) as u32
213+ ) ,
214+ Arc :: new ( values) ,
215+ & in_list,
216+ ) ;
217+ }
218+ }
219+ }
220+ }
221+
153222/// Entry point: registers in_list benchmarks for string and numeric array types.
154223fn criterion_benchmark ( c : & mut Criterion ) {
155224 let mut rng = StdRng :: seed_from_u64 ( 120320 ) ;
@@ -158,6 +227,14 @@ fn criterion_benchmark(c: &mut Criterion) {
158227 bench_string_type :: < StringArray > ( c, & mut rng, |s| ScalarValue :: Utf8 ( Some ( s) ) ) ;
159228 bench_string_type :: < StringViewArray > ( c, & mut rng, |s| ScalarValue :: Utf8View ( Some ( s) ) ) ;
160229
230+ // Realistic mixed-length string benchmarks (TPC-H style)
231+ bench_realistic_mixed_strings :: < StringArray > ( c, & mut rng, |s| {
232+ ScalarValue :: Utf8 ( Some ( s) )
233+ } ) ;
234+ bench_realistic_mixed_strings :: < StringViewArray > ( c, & mut rng, |s| {
235+ ScalarValue :: Utf8View ( Some ( s) )
236+ } ) ;
237+
161238 // Benchmarks for numeric types
162239 bench_numeric_type :: < u8 , UInt8Array > (
163240 c,
0 commit comments