Skip to content

Commit bb9a4a7

Browse files
bench: increase in_list benchmark coverage (#19443)
## Which issue does this PR close?

- Related to #19241

## Rationale for this change

This PR adds benchmarks and tests to ground upcoming `in_list` optimizations:

1. **Realistic Data Patterns**: Adds mixed-length string benchmarks to accurately measure the performance of the `StringView` two-stage lookup (prefix check + validation) across variable lengths.
2. **Type Coverage**: Adds baseline tests for temporal and decimal types to ensure correctness before they are migrated to specialized evaluation paths.

## What changes are included in this PR?

- **Mixed-Length Benchmarks**: Scenarios for `StringArray` and `StringViewArray` with variable lengths, match rates, and null densities.
- **Extended Tests**: Coverage for esoteric types (Temporal, Duration, Interval, Decimal256) in `physical-expr`.

## Are these changes tested?

Yes, via new unit tests and benchmark verification.

## Are there any user-facing changes?

No.
1 parent 1e59164 commit bb9a4a7

2 files changed

Lines changed: 369 additions & 10 deletions

File tree

datafusion/physical-expr/benches/in_list.rs

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ const NULL_PERCENTS: [f64; 2] = [0., 0.2];
5454
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
5555
const ARRAY_LENGTH: usize = 8192;
5656

57+
/// Mixed string lengths for realistic benchmarks.
58+
/// ~50% short (≤12 bytes), ~50% long (>12 bytes): four of the eight lengths fall on each side of the split.
/// NOTE(review): the 12-byte split presumably mirrors the inline-storage limit of Arrow view arrays — confirm.
59+
const MIXED_STRING_LENGTHS: &[usize] = &[3, 6, 9, 12, 16, 20, 25, 30];
60+
5761
/// Returns a friendly type name for the array type.
5862
fn array_type_name<A: 'static>() -> &'static str {
5963
let id = TypeId::of::<A>();
@@ -150,6 +154,71 @@ fn bench_numeric_type<T, A>(
150154
}
151155
}
152156

157+
/// Generates a random string with a length chosen from MIXED_STRING_LENGTHS.
158+
fn random_mixed_length_string(rng: &mut StdRng) -> String {
159+
let len = *MIXED_STRING_LENGTHS.choose(rng).unwrap();
160+
random_string(rng, len)
161+
}
162+
163+
/// Benchmarks realistic mixed-length IN list scenario.
164+
///
165+
/// Tests with:
166+
/// - Mixed short (≤12 bytes) and long (>12 bytes) strings in the IN list
167+
/// - Varying prefixes (fully random strings)
168+
/// - Configurable match rate (% of values that are in the IN list)
169+
/// - Various IN list sizes (3, 8, 28, 100)
170+
fn bench_realistic_mixed_strings<A>(
171+
c: &mut Criterion,
172+
rng: &mut StdRng,
173+
make_scalar: fn(String) -> ScalarValue,
174+
) where
175+
A: Array + FromIterator<Option<String>> + 'static,
176+
{
177+
for in_list_length in IN_LIST_LENGTHS {
178+
for match_percent in [0.0, 0.25, 0.75] {
179+
for null_percent in NULL_PERCENTS {
180+
// Generate IN list with mixed-length random strings
181+
let in_list_strings: Vec<String> = (0..in_list_length)
182+
.map(|_| random_mixed_length_string(rng))
183+
.collect();
184+
185+
let in_list: Vec<_> = in_list_strings
186+
.iter()
187+
.map(|s| make_scalar(s.clone()))
188+
.collect();
189+
190+
// Generate values array with controlled match rate
191+
let values: A = (0..ARRAY_LENGTH)
192+
.map(|_| {
193+
if !rng.random_bool(1.0 - null_percent) {
194+
None
195+
} else if rng.random_bool(match_percent) {
196+
// Pick from IN list (will match)
197+
Some(in_list_strings.choose(rng).unwrap().clone())
198+
} else {
199+
// Generate new random string (unlikely to match)
200+
Some(random_mixed_length_string(rng))
201+
}
202+
})
203+
.collect();
204+
205+
do_bench(
206+
c,
207+
&format!(
208+
"in_list/{}/mixed/list={}/match={}%/nulls={}%",
209+
array_type_name::<A>(),
210+
in_list_length,
211+
(match_percent * 100.0) as u32,
212+
(null_percent * 100.0) as u32
213+
),
214+
Arc::new(values),
215+
&in_list,
216+
);
217+
}
218+
}
219+
}
220+
}
221+
153222
/// Entry point: registers in_list benchmarks for string and numeric array types.
154223
fn criterion_benchmark(c: &mut Criterion) {
155224
let mut rng = StdRng::seed_from_u64(120320);
@@ -158,6 +227,14 @@ fn criterion_benchmark(c: &mut Criterion) {
158227
bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
159228
bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
160229

230+
// Realistic mixed-length string benchmarks (TPC-H style)
231+
bench_realistic_mixed_strings::<StringArray>(c, &mut rng, |s| {
232+
ScalarValue::Utf8(Some(s))
233+
});
234+
bench_realistic_mixed_strings::<StringViewArray>(c, &mut rng, |s| {
235+
ScalarValue::Utf8View(Some(s))
236+
});
237+
161238
// Benchmarks for numeric types
162239
bench_numeric_type::<u8, UInt8Array>(
163240
c,

0 commit comments

Comments
 (0)