Skip to content

Commit b50e1f7

Browse files
chore: Add microbenchmark (compared to ExprOrExpr) (#20076)
## Which issue does this PR close?

Related to #19994. This PR extracts the benchmark code to allow performance comparison.

## Rationale for this change

As pointed out by @alamb in #19994, this separates the microbenchmark code so that the benchmarking scripts can compare the optimization PR against main with the benchmark already in place.

## What changes are included in this PR?

Adds a microbenchmark for the divide-by-zero protection pattern in `case_when.rs`:

- Benchmarks with varying percentages of zeros (0%, 10%, 50%, 90%)
- Compares the `DivideByZeroProtection` pattern (where the checked column matches the divisor) vs. the `ExpressionOrExpression` fallback (where they don't match)

## Are these changes tested?

Benchmark code only.

## Are there any user-facing changes?

No.
1 parent 2860ada commit b50e1f7

1 file changed

Lines changed: 109 additions & 0 deletions

File tree

datafusion/physical-expr/benches/case_when.rs

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use arrow::datatypes::{ArrowNativeTypeOp, Field, Schema};
2020
use arrow::record_batch::RecordBatch;
2121
use arrow::util::test_util::seedable_rng;
2222
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
23+
use datafusion_common::ScalarValue;
2324
use datafusion_expr::Operator;
2425
use datafusion_physical_expr::expressions::{BinaryExpr, case, col, lit};
2526
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
@@ -93,6 +94,7 @@ fn criterion_benchmark(c: &mut Criterion) {
9394
run_benchmarks(c, &make_batch(8192, 100));
9495

9596
benchmark_lookup_table_case_when(c, 8192);
97+
benchmark_divide_by_zero_protection(c, 8192);
9698
}
9799

98100
fn run_benchmarks(c: &mut Criterion, batch: &RecordBatch) {
@@ -517,5 +519,112 @@ fn benchmark_lookup_table_case_when(c: &mut Criterion, batch_size: usize) {
517519
}
518520
}
519521

522+
fn benchmark_divide_by_zero_protection(c: &mut Criterion, batch_size: usize) {
523+
let mut group = c.benchmark_group("divide_by_zero_protection");
524+
525+
for zero_percentage in [0.0, 0.1, 0.5, 0.9] {
526+
let rng = &mut seedable_rng();
527+
528+
let numerator: Int32Array =
529+
(0..batch_size).map(|_| Some(rng.random::<i32>())).collect();
530+
531+
let divisor_values: Vec<Option<i32>> = (0..batch_size)
532+
.map(|_| {
533+
let roll: f32 = rng.random();
534+
if roll < zero_percentage {
535+
Some(0)
536+
} else {
537+
let mut val = rng.random::<i32>();
538+
while val == 0 {
539+
val = rng.random::<i32>();
540+
}
541+
Some(val)
542+
}
543+
})
544+
.collect();
545+
546+
let divisor: Int32Array = divisor_values.iter().cloned().collect();
547+
let divisor_copy: Int32Array = divisor_values.iter().cloned().collect();
548+
549+
let schema = Arc::new(Schema::new(vec![
550+
Field::new("numerator", numerator.data_type().clone(), true),
551+
Field::new("divisor", divisor.data_type().clone(), true),
552+
Field::new("divisor_copy", divisor_copy.data_type().clone(), true),
553+
]));
554+
555+
let batch = RecordBatch::try_new(
556+
Arc::clone(&schema),
557+
vec![
558+
Arc::new(numerator),
559+
Arc::new(divisor),
560+
Arc::new(divisor_copy),
561+
],
562+
)
563+
.unwrap();
564+
565+
let numerator_col = col("numerator", &batch.schema()).unwrap();
566+
let divisor_col = col("divisor", &batch.schema()).unwrap();
567+
let divisor_copy_col = col("divisor_copy", &batch.schema()).unwrap();
568+
569+
// DivideByZeroProtection: WHEN condition checks `divisor_col > 0` and division
570+
// uses `divisor_col` as divisor. Since the checked column matches the divisor,
571+
// this triggers the DivideByZeroProtection optimization.
572+
group.bench_function(
573+
format!(
574+
"{} rows, {}% zeros: DivideByZeroProtection",
575+
batch_size,
576+
(zero_percentage * 100.0) as i32
577+
),
578+
|b| {
579+
let when = Arc::new(BinaryExpr::new(
580+
Arc::clone(&divisor_col),
581+
Operator::Gt,
582+
lit(0i32),
583+
));
584+
let then = Arc::new(BinaryExpr::new(
585+
Arc::clone(&numerator_col),
586+
Operator::Divide,
587+
Arc::clone(&divisor_col),
588+
));
589+
let else_null: Arc<dyn PhysicalExpr> = lit(ScalarValue::Int32(None));
590+
let expr =
591+
Arc::new(case(None, vec![(when, then)], Some(else_null)).unwrap());
592+
593+
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
594+
},
595+
);
596+
597+
// ExpressionOrExpression: WHEN condition checks `divisor_copy_col > 0` but
598+
// division uses `divisor_col` as divisor. Since the checked column does NOT
599+
// match the divisor, this falls back to ExpressionOrExpression evaluation.
600+
group.bench_function(
601+
format!(
602+
"{} rows, {}% zeros: ExpressionOrExpression",
603+
batch_size,
604+
(zero_percentage * 100.0) as i32
605+
),
606+
|b| {
607+
let when = Arc::new(BinaryExpr::new(
608+
Arc::clone(&divisor_copy_col),
609+
Operator::Gt,
610+
lit(0i32),
611+
));
612+
let then = Arc::new(BinaryExpr::new(
613+
Arc::clone(&numerator_col),
614+
Operator::Divide,
615+
Arc::clone(&divisor_col),
616+
));
617+
let else_null: Arc<dyn PhysicalExpr> = lit(ScalarValue::Int32(None));
618+
let expr =
619+
Arc::new(case(None, vec![(when, then)], Some(else_null)).unwrap());
620+
621+
b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
622+
},
623+
);
624+
}
625+
626+
group.finish();
627+
}
628+
520629
criterion_group!(benches, criterion_benchmark);
521630
criterion_main!(benches);

0 commit comments

Comments
 (0)