Skip to content

Commit 43d32a8

Browse files
theirix and alamb authored
chore: use bench array helpers from Arrow bench_util (#21544)
## Which issue does this PR close?

- Closes #.

## Rationale for this change

While working on function benchmarks, I noticed duplicated code that can be replaced with the [arrow::util::bench_util](https://github.com/apache/arrow-rs/blob/main/arrow/src/util/bench_util.rs) helpers. I recall a discussion in apache/arrow-rs#7294 about bench_util / test_utils usage, and it seems these helpers can be used to simplify client code.

## What changes are included in this PR?

- Equivalent changes in benchmarks. More complicated and targeted helpers are left as is.

## Are these changes tested?

- Ran the benchmarks to check for errors or performance degradations.

## Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->
<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent d8c9797 commit 43d32a8

6 files changed

Lines changed: 100 additions & 331 deletions

File tree

datafusion/functions-aggregate/benches/array_agg.rs

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ use std::sync::Arc;
2020

2121
use arrow::array::{
2222
Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder,
23-
PrimitiveArray,
2423
};
2524
use arrow::datatypes::{Field, Int64Type};
2625
use criterion::{Criterion, criterion_group, criterion_main};
2726
use datafusion_expr::Accumulator;
2827
use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator;
2928

3029
use arrow::buffer::OffsetBuffer;
30+
use arrow::util::bench_util::create_primitive_array;
3131
use rand::Rng;
3232
use rand::SeedableRng;
3333
use rand::distr::{Distribution, StandardUniform};
@@ -54,24 +54,6 @@ fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) {
5454
});
5555
}
5656

57-
pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
58-
where
59-
T: ArrowPrimitiveType,
60-
StandardUniform: Distribution<T::Native>,
61-
{
62-
let mut rng = seedable_rng();
63-
64-
(0..size)
65-
.map(|_| {
66-
if rng.random::<f32>() < null_density {
67-
None
68-
} else {
69-
Some(rng.random())
70-
}
71-
})
72-
.collect()
73-
}
74-
7557
/// Create List array with the given item data type, null density, null locations and zero length lists density
7658
/// Creates a random (but fixed-seeded) array of a given size and null density
7759
pub fn create_list_array<T>(

datafusion/functions-nested/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ log = { workspace = true }
6464
memchr = { workspace = true }
6565

6666
[dev-dependencies]
67+
arrow = { workspace = true, features = ["test_utils"] }
6768
criterion = { workspace = true, features = ["async_tokio"] }
6869
rand = { workspace = true }
6970

datafusion/functions-nested/benches/array_min_max.rs

Lines changed: 11 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -17,81 +17,34 @@
1717

1818
use std::sync::Arc;
1919

20-
use arrow::array::{ArrayRef, Int64Array, ListArray};
21-
use arrow::buffer::{NullBuffer, OffsetBuffer};
22-
use arrow::datatypes::{DataType, Field};
20+
use arrow::array::{Array, ArrayRef};
21+
use arrow::datatypes::{DataType, Field, Int64Type};
22+
use arrow::util::bench_util::create_primitive_list_array_with_seed;
2323
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
2424
use datafusion_common::config::ConfigOptions;
2525
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
2626
use datafusion_functions_nested::min_max::ArrayMax;
27-
use rand::rngs::StdRng;
28-
use rand::{Rng, SeedableRng};
2927

3028
const NUM_ROWS: usize = 8192;
3129
const SEED: u64 = 42;
3230
const LIST_NULL_DENSITY: f64 = 0.1;
3331
const ELEMENT_NULL_DENSITY: f64 = 0.1;
3432

35-
fn create_int64_list_array(
36-
num_rows: usize,
37-
list_size: usize,
38-
element_null_density: f64,
39-
) -> ArrayRef {
40-
let mut rng = StdRng::seed_from_u64(SEED);
41-
let total_values = num_rows * list_size;
42-
43-
if element_null_density > 0.0 {
44-
let values: Vec<Option<i64>> = (0..total_values)
45-
.map(|_| {
46-
if rng.random::<f64>() < element_null_density {
47-
None
48-
} else {
49-
Some(rng.random::<i64>() % 10_000)
50-
}
51-
})
52-
.collect();
53-
let values_array = Arc::new(Int64Array::from(values));
54-
55-
let offsets: Vec<i32> = (0..=num_rows).map(|i| (i * list_size) as i32).collect();
56-
let nulls: Vec<bool> = (0..num_rows)
57-
.map(|_| rng.random::<f64>() >= LIST_NULL_DENSITY)
58-
.collect();
59-
60-
Arc::new(ListArray::new(
61-
Arc::new(Field::new("item", DataType::Int64, true)),
62-
OffsetBuffer::new(offsets.into()),
63-
values_array,
64-
Some(NullBuffer::from(nulls)),
65-
))
66-
} else {
67-
// No element nulls — values array has no null buffer
68-
let values: Vec<i64> = (0..total_values)
69-
.map(|_| rng.random::<i64>() % 10_000)
70-
.collect();
71-
let values_array = Arc::new(Int64Array::from(values));
72-
73-
let offsets: Vec<i32> = (0..=num_rows).map(|i| (i * list_size) as i32).collect();
74-
let nulls: Vec<bool> = (0..num_rows)
75-
.map(|_| rng.random::<f64>() >= LIST_NULL_DENSITY)
76-
.collect();
77-
78-
Arc::new(ListArray::new(
79-
Arc::new(Field::new("item", DataType::Int64, false)),
80-
OffsetBuffer::new(offsets.into()),
81-
values_array,
82-
Some(NullBuffer::from(nulls)),
83-
))
84-
}
85-
}
86-
8733
fn criterion_benchmark(c: &mut Criterion) {
8834
let udf = ArrayMax::new();
8935
let config_options = Arc::new(ConfigOptions::default());
9036

9137
for list_size in [10, 100, 1000] {
9238
for (label, null_density) in [("nulls", ELEMENT_NULL_DENSITY), ("no_nulls", 0.0)]
9339
{
94-
let list_array = create_int64_list_array(NUM_ROWS, list_size, null_density);
40+
let list_array: ArrayRef =
41+
Arc::new(create_primitive_list_array_with_seed::<i32, Int64Type>(
42+
NUM_ROWS,
43+
LIST_NULL_DENSITY as f32,
44+
null_density as f32,
45+
list_size,
46+
SEED,
47+
));
9548
let args = vec![ColumnarValue::Array(Arc::clone(&list_array))];
9649
let arg_fields =
9750
vec![Field::new("arg_0", list_array.data_type().clone(), true).into()];

datafusion/functions-nested/benches/array_remove.rs

Lines changed: 33 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
// under the License.
1717

1818
use arrow::array::{
19-
Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array, FixedSizeBinaryArray,
20-
Float64Array, Int64Array, ListArray, StringArray,
19+
Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeBinaryArray, ListArray,
20+
StringArray,
2121
};
2222
use arrow::buffer::OffsetBuffer;
23-
use arrow::datatypes::{DataType, Field};
23+
use arrow::datatypes::{DataType, Decimal128Type, Field, Float64Type, Int64Type};
24+
use arrow::util::bench_util::create_primitive_list_array_with_seed;
2425
use criterion::{
2526
criterion_group, criterion_main, {BenchmarkId, Criterion},
2627
};
@@ -55,7 +56,15 @@ fn bench_array_remove_int64(c: &mut Criterion) {
5556
let mut group = c.benchmark_group("array_remove_int64");
5657

5758
for &array_size in ARRAY_SIZES {
58-
let list_array = create_int64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
59+
let list_array: ArrayRef =
60+
Arc::new(create_primitive_list_array_with_seed::<i32, Int64Type>(
61+
NUM_ROWS,
62+
0.0,
63+
NULL_DENSITY as f32,
64+
array_size,
65+
SEED,
66+
));
67+
5968
let element_to_remove = ScalarValue::Int64(Some(1));
6069
let args = create_args(list_array.clone(), element_to_remove.clone());
6170

@@ -96,7 +105,14 @@ fn bench_array_remove_f64(c: &mut Criterion) {
96105
let mut group = c.benchmark_group("array_remove_f64");
97106

98107
for &array_size in ARRAY_SIZES {
99-
let list_array = create_f64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
108+
let list_array: ArrayRef =
109+
Arc::new(create_primitive_list_array_with_seed::<i32, Float64Type>(
110+
NUM_ROWS,
111+
0.0,
112+
NULL_DENSITY as f32,
113+
array_size,
114+
SEED,
115+
));
100116
let element_to_remove = ScalarValue::Float64(Some(1.0));
101117
let args = create_args(list_array.clone(), element_to_remove.clone());
102118

@@ -260,8 +276,17 @@ fn bench_array_remove_decimal64(c: &mut Criterion) {
260276
let mut group = c.benchmark_group("array_remove_decimal64");
261277

262278
for &array_size in ARRAY_SIZES {
263-
let list_array = create_decimal64_list_array(NUM_ROWS, array_size, NULL_DENSITY);
264-
let element_to_remove = ScalarValue::Decimal128(Some(100_i128), 10, 2);
279+
let list_array: ArrayRef = Arc::new(create_primitive_list_array_with_seed::<
280+
i32,
281+
Decimal128Type,
282+
>(
283+
NUM_ROWS,
284+
0.0,
285+
NULL_DENSITY as f32,
286+
array_size,
287+
SEED,
288+
));
289+
let element_to_remove = ScalarValue::Decimal128(Some(100_i128), 38, 10);
265290
let args = create_args(list_array.clone(), element_to_remove.clone());
266291

267292
group.bench_with_input(
@@ -276,7 +301,7 @@ fn bench_array_remove_decimal64(c: &mut Criterion) {
276301
arg_fields: vec![
277302
Field::new("arr", list_array.data_type().clone(), false)
278303
.into(),
279-
Field::new("el", DataType::Decimal128(10, 2), false)
304+
Field::new("el", DataType::Decimal128(38, 10), false)
280305
.into(),
281306
],
282307
number_rows: NUM_ROWS,
@@ -348,66 +373,6 @@ fn create_args(list_array: ArrayRef, element: ScalarValue) -> Vec<ColumnarValue>
348373
]
349374
}
350375

351-
fn create_int64_list_array(
352-
num_rows: usize,
353-
array_size: usize,
354-
null_density: f64,
355-
) -> ArrayRef {
356-
let mut rng = StdRng::seed_from_u64(SEED);
357-
let values = (0..num_rows * array_size)
358-
.map(|_| {
359-
if rng.random::<f64>() < null_density {
360-
None
361-
} else {
362-
Some(rng.random_range(0..array_size as i64))
363-
}
364-
})
365-
.collect::<Int64Array>();
366-
let offsets = (0..=num_rows)
367-
.map(|i| (i * array_size) as i32)
368-
.collect::<Vec<i32>>();
369-
370-
Arc::new(
371-
ListArray::try_new(
372-
Arc::new(Field::new("item", DataType::Int64, true)),
373-
OffsetBuffer::new(offsets.into()),
374-
Arc::new(values),
375-
None,
376-
)
377-
.unwrap(),
378-
)
379-
}
380-
381-
fn create_f64_list_array(
382-
num_rows: usize,
383-
array_size: usize,
384-
null_density: f64,
385-
) -> ArrayRef {
386-
let mut rng = StdRng::seed_from_u64(SEED);
387-
let values = (0..num_rows * array_size)
388-
.map(|_| {
389-
if rng.random::<f64>() < null_density {
390-
None
391-
} else {
392-
Some(rng.random_range(0..array_size as i64) as f64)
393-
}
394-
})
395-
.collect::<Float64Array>();
396-
let offsets = (0..=num_rows)
397-
.map(|i| (i * array_size) as i32)
398-
.collect::<Vec<i32>>();
399-
400-
Arc::new(
401-
ListArray::try_new(
402-
Arc::new(Field::new("item", DataType::Float64, true)),
403-
OffsetBuffer::new(offsets.into()),
404-
Arc::new(values),
405-
None,
406-
)
407-
.unwrap(),
408-
)
409-
}
410-
411376
fn create_string_list_array(
412377
num_rows: usize,
413378
array_size: usize,
@@ -500,38 +465,6 @@ fn create_boolean_list_array(
500465
)
501466
}
502467

503-
fn create_decimal64_list_array(
504-
num_rows: usize,
505-
array_size: usize,
506-
null_density: f64,
507-
) -> ArrayRef {
508-
let mut rng = StdRng::seed_from_u64(SEED);
509-
let values = (0..num_rows * array_size)
510-
.map(|_| {
511-
if rng.random::<f64>() < null_density {
512-
None
513-
} else {
514-
Some(rng.random_range(0..array_size) as i128 * 100)
515-
}
516-
})
517-
.collect::<Decimal128Array>()
518-
.with_precision_and_scale(10, 2)
519-
.unwrap();
520-
let offsets = (0..=num_rows)
521-
.map(|i| (i * array_size) as i32)
522-
.collect::<Vec<i32>>();
523-
524-
Arc::new(
525-
ListArray::try_new(
526-
Arc::new(Field::new("item", DataType::Decimal128(10, 2), true)),
527-
OffsetBuffer::new(offsets.into()),
528-
Arc::new(values),
529-
None,
530-
)
531-
.unwrap(),
532-
)
533-
}
534-
535468
fn create_fixed_size_binary_list_array(
536469
num_rows: usize,
537470
array_size: usize,

0 commit comments

Comments
 (0)