Skip to content

Commit 28b5b37

Browse files
committed
feat: use bitmasks for multiple column aggregation
use `BooleanBufferBuilder` from https://github.com/apache/arrow-rs/blob/d6168e526aae79d6fbafe8c11062b5f834021052/arrow-buffer/src/util/bit_util.rs
1 parent 8efed09 commit 28b5b37

5 files changed

Lines changed: 300 additions & 322 deletions

File tree

datafusion/physical-plan/src/aggregates/group_values/multi_group_by/boolean.rs

Lines changed: 62 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ use crate::aggregates::group_values::multi_group_by::{GroupColumn, nulls_equal_t
2222
use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
2323
use arrow::array::{Array as _, ArrayRef, AsArray, BooleanArray, BooleanBufferBuilder};
2424
use datafusion_common::Result;
25-
use itertools::izip;
2625

2726
/// An implementation of [`GroupColumn`] for booleans
2827
///
@@ -81,32 +80,31 @@ impl<const NULLABLE: bool> GroupColumn for BooleanGroupValueBuilder<NULLABLE> {
8180
lhs_rows: &[usize],
8281
array: &ArrayRef,
8382
rhs_rows: &[usize],
84-
equal_to_results: &mut [bool],
83+
equal_to_results: &mut BooleanBufferBuilder,
8584
) {
8685
let array = array.as_boolean();
8786

88-
let iter = izip!(
89-
lhs_rows.iter(),
90-
rhs_rows.iter(),
91-
equal_to_results.iter_mut(),
92-
);
93-
94-
for (&lhs_row, &rhs_row, equal_to_result) in iter {
95-
// Has found not equal to in previous column, don't need to check
96-
if !*equal_to_result {
87+
for (idx, (&lhs_row, &rhs_row)) in
88+
lhs_rows.iter().zip(rhs_rows.iter()).enumerate()
89+
{
90+
if !equal_to_results.get_bit(idx) {
9791
continue;
9892
}
9993

10094
if NULLABLE {
10195
let exist_null = self.nulls.is_null(lhs_row);
10296
let input_null = array.is_null(rhs_row);
10397
if let Some(result) = nulls_equal_to(exist_null, input_null) {
104-
*equal_to_result = result;
98+
if !result {
99+
equal_to_results.set_bit(idx, false);
100+
}
105101
continue;
106102
}
107103
}
108104

109-
*equal_to_result = self.buffer.get_bit(lhs_row) == array.value(rhs_row);
105+
if self.buffer.get_bit(lhs_row) != array.value(rhs_row) {
106+
equal_to_results.set_bit(idx, false);
107+
}
110108
}
111109
}
112110

@@ -195,10 +193,20 @@ impl<const NULLABLE: bool> GroupColumn for BooleanGroupValueBuilder<NULLABLE> {
195193

196194
#[cfg(test)]
197195
mod tests {
198-
use arrow::array::NullBufferBuilder;
196+
use arrow::array::{BooleanBufferBuilder, NullBufferBuilder};
199197

200198
use super::*;
201199

200+
/// Test helper: creates a `BooleanBufferBuilder` via `new(n)` and appends
/// `n` `true` bits to it, so every position `0..n` starts out set.
fn make_true_buffer(n: usize) -> BooleanBufferBuilder {
201+
let mut buf = BooleanBufferBuilder::new(n);
202+
buf.append_n(n, true);
203+
buf
204+
}
205+
206+
/// Test helper: reads every bit in `0..buf.len()` with `get_bit` and
/// collects the values, in index order, into a `Vec<bool>`.
fn to_vec(buf: &BooleanBufferBuilder) -> Vec<bool> {
207+
(0..buf.len()).map(|i| buf.get_bit(i)).collect()
208+
}
209+
202210
#[test]
203211
fn test_nullable_boolean_equal_to() {
204212
let append = |builder: &mut BooleanGroupValueBuilder<true>,
@@ -213,10 +221,11 @@ mod tests {
213221
lhs_rows: &[usize],
214222
input_array: &ArrayRef,
215223
rhs_rows: &[usize],
216-
equal_to_results: &mut Vec<bool>| {
224+
equal_to_results: &mut BooleanBufferBuilder| {
217225
let iter = lhs_rows.iter().zip(rhs_rows.iter());
218226
for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
219-
equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
227+
equal_to_results
228+
.set_bit(idx, builder.equal_to(lhs_row, input_array, rhs_row));
220229
}
221230
};
222231

@@ -237,7 +246,7 @@ mod tests {
237246
lhs_rows: &[usize],
238247
input_array: &ArrayRef,
239248
rhs_rows: &[usize],
240-
equal_to_results: &mut Vec<bool>| {
249+
equal_to_results: &mut BooleanBufferBuilder| {
241250
builder.vectorized_equal_to(
242251
lhs_rows,
243252
input_array,
@@ -257,18 +266,9 @@ mod tests {
257266
&[usize],
258267
&ArrayRef,
259268
&[usize],
260-
&mut Vec<bool>,
269+
&mut BooleanBufferBuilder,
261270
),
262271
{
263-
// Will cover such cases:
264-
// - exist null, input not null
265-
// - exist null, input null; values not equal
266-
// - exist null, input null; values equal
267-
// - exist not null, input null
268-
// - exist not null, input not null; values not equal
269-
// - exist not null, input not null; values equal
270-
271-
// Define PrimitiveGroupValueBuilder
272272
let mut builder = BooleanGroupValueBuilder::<true>::new();
273273
let builder_array = Arc::new(BooleanArray::from(vec![
274274
None,
@@ -280,7 +280,6 @@ mod tests {
280280
])) as ArrayRef;
281281
append(&mut builder, &builder_array, &[0, 1, 2, 3, 4, 5]);
282282

283-
// Define input array
284283
let (values, _nulls) = BooleanArray::from(vec![
285284
Some(true),
286285
Some(false),
@@ -291,32 +290,31 @@ mod tests {
291290
])
292291
.into_parts();
293292

294-
// explicitly build a null buffer where one of the null values also happens to match
295293
let mut nulls = NullBufferBuilder::new(6);
296294
nulls.append_non_null();
297-
nulls.append_null(); // this sets Some(false) to null above
295+
nulls.append_null();
298296
nulls.append_null();
299297
nulls.append_null();
300298
nulls.append_non_null();
301299
nulls.append_non_null();
302300
let input_array = Arc::new(BooleanArray::new(values, nulls.finish())) as ArrayRef;
303301

304-
// Check
305-
let mut equal_to_results = vec![true; builder.len()];
302+
let mut equal_to_results = make_true_buffer(builder.len());
306303
equal_to(
307304
&builder,
308305
&[0, 1, 2, 3, 4, 5],
309306
&input_array,
310307
&[0, 1, 2, 3, 4, 5],
311308
&mut equal_to_results,
312309
);
313-
314-
assert!(!equal_to_results[0]);
315-
assert!(equal_to_results[1]);
316-
assert!(equal_to_results[2]);
317-
assert!(!equal_to_results[3]);
318-
assert!(!equal_to_results[4]);
319-
assert!(equal_to_results[5]);
310+
let results = to_vec(&equal_to_results);
311+
312+
assert!(!results[0]);
313+
assert!(results[1]);
314+
assert!(results[2]);
315+
assert!(!results[3]);
316+
assert!(!results[4]);
317+
assert!(results[5]);
320318
}
321319

322320
#[test]
@@ -333,10 +331,11 @@ mod tests {
333331
lhs_rows: &[usize],
334332
input_array: &ArrayRef,
335333
rhs_rows: &[usize],
336-
equal_to_results: &mut Vec<bool>| {
334+
equal_to_results: &mut BooleanBufferBuilder| {
337335
let iter = lhs_rows.iter().zip(rhs_rows.iter());
338336
for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
339-
equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
337+
equal_to_results
338+
.set_bit(idx, builder.equal_to(lhs_row, input_array, rhs_row));
340339
}
341340
};
342341

@@ -357,7 +356,7 @@ mod tests {
357356
lhs_rows: &[usize],
358357
input_array: &ArrayRef,
359358
rhs_rows: &[usize],
360-
equal_to_results: &mut Vec<bool>| {
359+
equal_to_results: &mut BooleanBufferBuilder| {
361360
builder.vectorized_equal_to(
362361
lhs_rows,
363362
input_array,
@@ -377,14 +376,9 @@ mod tests {
377376
&[usize],
378377
&ArrayRef,
379378
&[usize],
380-
&mut Vec<bool>,
379+
&mut BooleanBufferBuilder,
381380
),
382381
{
383-
// Will cover such cases:
384-
// - values equal
385-
// - values not equal
386-
387-
// Define PrimitiveGroupValueBuilder
388382
let mut builder = BooleanGroupValueBuilder::<false>::new();
389383
let builder_array = Arc::new(BooleanArray::from(vec![
390384
Some(false),
@@ -394,35 +388,31 @@ mod tests {
394388
])) as ArrayRef;
395389
append(&mut builder, &builder_array, &[0, 1, 2, 3]);
396390

397-
// Define input array
398391
let input_array = Arc::new(BooleanArray::from(vec![
399392
Some(false),
400393
Some(false),
401394
Some(true),
402395
Some(true),
403396
])) as ArrayRef;
404397

405-
// Check
406-
let mut equal_to_results = vec![true; builder.len()];
398+
let mut equal_to_results = make_true_buffer(builder.len());
407399
equal_to(
408400
&builder,
409401
&[0, 1, 2, 3],
410402
&input_array,
411403
&[0, 1, 2, 3],
412404
&mut equal_to_results,
413405
);
406+
let results = to_vec(&equal_to_results);
414407

415-
assert!(equal_to_results[0]);
416-
assert!(!equal_to_results[1]);
417-
assert!(!equal_to_results[2]);
418-
assert!(equal_to_results[3]);
408+
assert!(results[0]);
409+
assert!(!results[1]);
410+
assert!(!results[2]);
411+
assert!(results[3]);
419412
}
420413

421414
#[test]
422415
fn test_nullable_boolean_vectorized_operation_special_case() {
423-
// Test the special `all nulls` or `not nulls` input array case
424-
// for vectorized append and equal to
425-
426416
let mut builder = BooleanGroupValueBuilder::<true>::new();
427417

428418
// All nulls input array
@@ -432,19 +422,20 @@ mod tests {
432422
.vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4])
433423
.unwrap();
434424

435-
let mut equal_to_results = vec![true; all_nulls_input_array.len()];
425+
let mut equal_to_results = make_true_buffer(all_nulls_input_array.len());
436426
builder.vectorized_equal_to(
437427
&[0, 1, 2, 3, 4],
438428
&all_nulls_input_array,
439429
&[0, 1, 2, 3, 4],
440430
&mut equal_to_results,
441431
);
432+
let results = to_vec(&equal_to_results);
442433

443-
assert!(equal_to_results[0]);
444-
assert!(equal_to_results[1]);
445-
assert!(equal_to_results[2]);
446-
assert!(equal_to_results[3]);
447-
assert!(equal_to_results[4]);
434+
assert!(results[0]);
435+
assert!(results[1]);
436+
assert!(results[2]);
437+
assert!(results[3]);
438+
assert!(results[4]);
448439

449440
// All not nulls input array
450441
let all_not_nulls_input_array = Arc::new(BooleanArray::from(vec![
@@ -458,18 +449,19 @@ mod tests {
458449
.vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4])
459450
.unwrap();
460451

461-
let mut equal_to_results = vec![true; all_not_nulls_input_array.len()];
452+
let mut equal_to_results = make_true_buffer(all_not_nulls_input_array.len());
462453
builder.vectorized_equal_to(
463454
&[5, 6, 7, 8, 9],
464455
&all_not_nulls_input_array,
465456
&[0, 1, 2, 3, 4],
466457
&mut equal_to_results,
467458
);
459+
let results = to_vec(&equal_to_results);
468460

469-
assert!(equal_to_results[0]);
470-
assert!(equal_to_results[1]);
471-
assert!(equal_to_results[2]);
472-
assert!(equal_to_results[3]);
473-
assert!(equal_to_results[4]);
461+
assert!(results[0]);
462+
assert!(results[1]);
463+
assert!(results[2]);
464+
assert!(results[3]);
465+
assert!(results[4]);
474466
}
475467
}

0 commit comments

Comments
 (0)