Skip to content

Commit ecf3b50

Browse files
authored
Add StructArray and RunArray benchmark tests to with_hashes (#20182)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #20181 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> Issue #20152 shows some areas of optimization for `RunArray` and `StructArray` hashing. But the existing `with_hashes` benchmark tests don't include coverage for these! ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> Added benchmarks to `with_hashes.rs`: - **StructArray**: 4-column struct (bool, int32, int64, string) - **RunArray**: Int32 run-encoded array - Both include single/multiple columns and with/without nulls ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> No additional tests added, but the benchmarks both compile and run. 
<details> <summary>a sample run:</summary> ``` ❯ cargo bench --features=parquet --bench with_hashes -- array Compiling datafusion-common v52.1.0 (/Users/notashes/dev/datafusion/datafusion/common) Finished `bench` profile [optimized] target(s) in 34.49s Running benches/with_hashes.rs (target/release/deps/with_hashes-2f180744d22084f3) Gnuplot not found, using plotters backend struct_array: single, no nulls time: [38.389 µs 38.437 µs 38.485 µs] Found 5 outliers among 100 measurements (5.00%) 1 (1.00%) low severe 2 (2.00%) low mild 2 (2.00%) high mild struct_array: single, nulls time: [46.108 µs 46.197 µs 46.291 µs] Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe struct_array: multiple, no nulls time: [114.64 µs 114.79 µs 114.93 µs] Found 4 outliers among 100 measurements (4.00%) 1 (1.00%) low severe 2 (2.00%) low mild 1 (1.00%) high mild struct_array: multiple, nulls time: [138.29 µs 138.62 µs 139.07 µs] Found 8 outliers among 100 measurements (8.00%) 1 (1.00%) low severe 4 (4.00%) low mild 1 (1.00%) high mild 2 (2.00%) high severe run_array_int32: single, no nulls time: [1.8777 µs 1.9098 µs 1.9457 µs] Found 3 outliers among 100 measurements (3.00%) 3 (3.00%) high mild run_array_int32: single, nulls time: [2.0110 µs 2.0417 µs 2.0751 µs] Found 7 outliers among 100 measurements (7.00%) 6 (6.00%) high mild 1 (1.00%) high severe run_array_int32: multiple, no nulls time: [5.0511 µs 5.0603 µs 5.0693 µs] Found 6 outliers among 100 measurements (6.00%) 1 (1.00%) low mild 5 (5.00%) high mild run_array_int32: multiple, nulls time: [5.6052 µs 5.6201 µs 5.6353 µs] Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe ``` </details> ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 4e2c0f1 commit ecf3b50

1 file changed

Lines changed: 149 additions & 25 deletions

File tree

datafusion/common/benches/with_hashes.rs

Lines changed: 149 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,13 @@
2020
use ahash::RandomState;
2121
use arrow::array::{
2222
Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericStringArray,
23-
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, StringViewArray, make_array,
23+
NullBufferBuilder, OffsetSizeTrait, PrimitiveArray, RunArray, StringViewArray,
24+
StructArray, make_array,
2425
};
2526
use arrow::buffer::NullBuffer;
26-
use arrow::datatypes::{ArrowDictionaryKeyType, Int32Type, Int64Type};
27+
use arrow::datatypes::{
28+
ArrowDictionaryKeyType, DataType, Field, Fields, Int32Type, Int64Type,
29+
};
2730
use criterion::{Bencher, Criterion, criterion_group, criterion_main};
2831
use datafusion_common::hash_utils::with_hashes;
2932
use rand::Rng;
@@ -37,6 +40,7 @@ const BATCH_SIZE: usize = 8192;
3740
/// One benchmark case: a labelled input array for the `with_hashes` benchmarks.
struct BenchData {
3841
/// Label used as the prefix of every benchmark name generated for this case
/// (e.g. `"{name}: single, no nulls"`).
name: &'static str,
3942
/// The array that is hashed by the benchmark.
array: ArrayRef,
43+
/// When true, the "single, nulls" and "multiple, nulls" variants are also run,
/// using the array produced by `add_nulls(&array)`.
supports_nulls: bool,
4044
}
4145

4246
fn criterion_benchmark(c: &mut Criterion) {
@@ -47,50 +51,74 @@ fn criterion_benchmark(c: &mut Criterion) {
4751
BenchData {
4852
name: "int64",
4953
array: primitive_array::<Int64Type>(BATCH_SIZE),
54+
supports_nulls: true,
5055
},
5156
BenchData {
5257
name: "utf8",
5358
array: pool.string_array::<i32>(BATCH_SIZE),
59+
supports_nulls: true,
5460
},
5561
BenchData {
5662
name: "large_utf8",
5763
array: pool.string_array::<i64>(BATCH_SIZE),
64+
supports_nulls: true,
5865
},
5966
BenchData {
6067
name: "utf8_view",
6168
array: pool.string_view_array(BATCH_SIZE),
69+
supports_nulls: true,
6270
},
6371
BenchData {
6472
name: "utf8_view (small)",
6573
array: small_pool.string_view_array(BATCH_SIZE),
74+
supports_nulls: true,
6675
},
6776
BenchData {
6877
name: "dictionary_utf8_int32",
6978
array: pool.dictionary_array::<Int32Type>(BATCH_SIZE),
79+
supports_nulls: true,
80+
},
81+
BenchData {
82+
name: "struct_array",
83+
array: create_struct_array(&pool, BATCH_SIZE),
84+
supports_nulls: true,
85+
},
86+
BenchData {
87+
name: "run_array_int32",
88+
array: create_run_array::<Int32Type>(BATCH_SIZE),
89+
supports_nulls: true,
7090
},
7191
];
7292

73-
for BenchData { name, array } in cases {
74-
// with_hash has different code paths for single vs multiple arrays and nulls vs no nulls
75-
let nullable_array = add_nulls(&array);
93+
for BenchData {
94+
name,
95+
array,
96+
supports_nulls,
97+
} in cases
98+
{
7699
c.bench_function(&format!("{name}: single, no nulls"), |b| {
77100
do_hash_test(b, std::slice::from_ref(&array));
78101
});
79-
c.bench_function(&format!("{name}: single, nulls"), |b| {
80-
do_hash_test(b, std::slice::from_ref(&nullable_array));
81-
});
82102
c.bench_function(&format!("{name}: multiple, no nulls"), |b| {
83103
let arrays = vec![array.clone(), array.clone(), array.clone()];
84104
do_hash_test(b, &arrays);
85105
});
86-
c.bench_function(&format!("{name}: multiple, nulls"), |b| {
87-
let arrays = vec![
88-
nullable_array.clone(),
89-
nullable_array.clone(),
90-
nullable_array.clone(),
91-
];
92-
do_hash_test(b, &arrays);
93-
});
106+
107+
if supports_nulls {
108+
let nullable_array = add_nulls(&array);
109+
110+
c.bench_function(&format!("{name}: single, nulls"), |b| {
111+
do_hash_test(b, std::slice::from_ref(&nullable_array));
112+
});
113+
c.bench_function(&format!("{name}: multiple, nulls"), |b| {
114+
let arrays = vec![
115+
nullable_array.clone(),
116+
nullable_array.clone(),
117+
nullable_array.clone(),
118+
];
119+
do_hash_test(b, &arrays);
120+
});
121+
}
94122
}
95123
}
96124

@@ -122,16 +150,51 @@ where
122150
builder.finish().expect("should be nulls in buffer")
123151
}
124152

125-
// Returns an new array that is the same as array, but with nulls
153+
// Returns a new array that is the same as array, but with nulls
154+
// Handles the special case of RunArray where nulls must be in the values array
126155
fn add_nulls(array: &ArrayRef) -> ArrayRef {
127-
let array_data = array
128-
.clone()
129-
.into_data()
130-
.into_builder()
131-
.nulls(Some(create_null_mask(array.len())))
132-
.build()
133-
.unwrap();
134-
make_array(array_data)
156+
use arrow::datatypes::DataType;
157+
158+
match array.data_type() {
159+
DataType::RunEndEncoded(_, _) => {
160+
// RunArray can't have top-level nulls, so apply nulls to the values array
161+
let run_array = array
162+
.as_any()
163+
.downcast_ref::<RunArray<Int32Type>>()
164+
.expect("Expected RunArray");
165+
166+
let run_ends_buffer = run_array.run_ends().inner().clone();
167+
let run_ends_array = PrimitiveArray::<Int32Type>::new(run_ends_buffer, None);
168+
let values = run_array.values().clone();
169+
170+
// Add nulls to the values array
171+
let values_with_nulls = {
172+
let array_data = values
173+
.clone()
174+
.into_data()
175+
.into_builder()
176+
.nulls(Some(create_null_mask(values.len())))
177+
.build()
178+
.unwrap();
179+
make_array(array_data)
180+
};
181+
182+
Arc::new(
183+
RunArray::try_new(&run_ends_array, values_with_nulls.as_ref())
184+
.expect("Failed to create RunArray with null values"),
185+
)
186+
}
187+
_ => {
188+
let array_data = array
189+
.clone()
190+
.into_data()
191+
.into_builder()
192+
.nulls(Some(create_null_mask(array.len())))
193+
.build()
194+
.unwrap();
195+
make_array(array_data)
196+
}
197+
}
135198
}
136199

137200
pub fn make_rng() -> StdRng {
@@ -205,5 +268,66 @@ where
205268
Arc::new(array)
206269
}
207270

271+
fn boolean_array(array_len: usize) -> ArrayRef {
272+
let mut rng = make_rng();
273+
Arc::new(
274+
(0..array_len)
275+
.map(|_| Some(rng.random::<bool>()))
276+
.collect::<arrow::array::BooleanArray>(),
277+
)
278+
}
279+
280+
/// Create a StructArray with multiple columns
281+
fn create_struct_array(pool: &StringPool, array_len: usize) -> ArrayRef {
282+
let bool_array = boolean_array(array_len);
283+
let int32_array = primitive_array::<Int32Type>(array_len);
284+
let int64_array = primitive_array::<Int64Type>(array_len);
285+
let str_array = pool.string_array::<i32>(array_len);
286+
287+
let fields = Fields::from(vec![
288+
Field::new("bool_col", DataType::Boolean, false),
289+
Field::new("int32_col", DataType::Int32, false),
290+
Field::new("int64_col", DataType::Int64, false),
291+
Field::new("string_col", DataType::Utf8, false),
292+
]);
293+
294+
Arc::new(StructArray::new(
295+
fields,
296+
vec![bool_array, int32_array, int64_array, str_array],
297+
None,
298+
))
299+
}
300+
301+
/// Create a RunArray to test run array hashing.
302+
fn create_run_array<T>(array_len: usize) -> ArrayRef
303+
where
304+
T: ArrowPrimitiveType,
305+
StandardUniform: Distribution<T::Native>,
306+
{
307+
let mut rng = make_rng();
308+
309+
// Create runs of varying lengths
310+
let mut run_ends = Vec::new();
311+
let mut values = Vec::new();
312+
let mut current_end = 0;
313+
314+
while current_end < array_len {
315+
// Random run length between 1 and 50
316+
let run_length = rng.random_range(1..=50).min(array_len - current_end);
317+
current_end += run_length;
318+
run_ends.push(current_end as i32);
319+
values.push(Some(rng.random::<T::Native>()));
320+
}
321+
322+
let run_ends_array = Arc::new(PrimitiveArray::<Int32Type>::from(run_ends));
323+
let values_array: Arc<dyn Array> =
324+
Arc::new(values.into_iter().collect::<PrimitiveArray<T>>());
325+
326+
Arc::new(
327+
RunArray::try_new(&run_ends_array, values_array.as_ref())
328+
.expect("Failed to create RunArray"),
329+
)
330+
}
331+
208332
criterion_group!(benches, criterion_benchmark);
209333
criterion_main!(benches);

0 commit comments

Comments
 (0)