Skip to content

Commit 4162bb6

Browse files
committed
feat: use bitmasks for multiple column aggregation
1 parent bbf67d9 commit 4162bb6

5 files changed

Lines changed: 333 additions & 197 deletions

File tree

datafusion/physical-plan/src/aggregates/group_values/multi_group_by/boolean.rs

Lines changed: 57 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@
1818
use std::sync::Arc;
1919

2020
use crate::aggregates::group_values::multi_group_by::Nulls;
21-
use crate::aggregates::group_values::multi_group_by::{GroupColumn, nulls_equal_to};
21+
use crate::aggregates::group_values::multi_group_by::{EqualToResults, GroupColumn, nulls_equal_to};
2222
use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder;
2323
use arrow::array::{Array as _, ArrayRef, AsArray, BooleanArray, BooleanBufferBuilder};
2424
use datafusion_common::Result;
25-
use itertools::izip;
2625

2726
/// An implementation of [`GroupColumn`] for booleans
2827
///
@@ -81,32 +80,34 @@ impl<const NULLABLE: bool> GroupColumn for BooleanGroupValueBuilder<NULLABLE> {
8180
lhs_rows: &[usize],
8281
array: &ArrayRef,
8382
rhs_rows: &[usize],
84-
equal_to_results: &mut [bool],
83+
equal_to_results: &mut EqualToResults,
8584
) {
8685
let array = array.as_boolean();
8786

88-
let iter = izip!(
89-
lhs_rows.iter(),
90-
rhs_rows.iter(),
91-
equal_to_results.iter_mut(),
92-
);
93-
94-
for (&lhs_row, &rhs_row, equal_to_result) in iter {
95-
// Has found not equal to in previous column, don't need to check
96-
if !*equal_to_result {
87+
for (idx, (&lhs_row, &rhs_row)) in
88+
lhs_rows.iter().zip(rhs_rows.iter()).enumerate()
89+
{
90+
if !equal_to_results.get_bit(idx) {
9791
continue;
9892
}
9993

94+
let chunk_idx = idx / 64;
95+
let bit_pos = idx % 64;
96+
10097
if NULLABLE {
10198
let exist_null = self.nulls.is_null(lhs_row);
10299
let input_null = array.is_null(rhs_row);
103100
if let Some(result) = nulls_equal_to(exist_null, input_null) {
104-
*equal_to_result = result;
101+
if !result {
102+
equal_to_results.and_chunk(chunk_idx, !(1u64 << bit_pos));
103+
}
105104
continue;
106105
}
107106
}
108107

109-
*equal_to_result = self.buffer.get_bit(lhs_row) == array.value(rhs_row);
108+
if self.buffer.get_bit(lhs_row) != array.value(rhs_row) {
109+
equal_to_results.and_chunk(chunk_idx, !(1u64 << bit_pos));
110+
}
110111
}
111112
}
112113

@@ -195,6 +196,7 @@ impl<const NULLABLE: bool> GroupColumn for BooleanGroupValueBuilder<NULLABLE> {
195196

196197
#[cfg(test)]
197198
mod tests {
199+
use crate::aggregates::group_values::multi_group_by::EqualToResults;
198200
use arrow::array::NullBufferBuilder;
199201

200202
use super::*;
@@ -213,10 +215,10 @@ mod tests {
213215
lhs_rows: &[usize],
214216
input_array: &ArrayRef,
215217
rhs_rows: &[usize],
216-
equal_to_results: &mut Vec<bool>| {
218+
equal_to_results: &mut EqualToResults| {
217219
let iter = lhs_rows.iter().zip(rhs_rows.iter());
218220
for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
219-
equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
221+
equal_to_results.set_bit(idx, builder.equal_to(lhs_row, input_array, rhs_row));
220222
}
221223
};
222224

@@ -237,7 +239,7 @@ mod tests {
237239
lhs_rows: &[usize],
238240
input_array: &ArrayRef,
239241
rhs_rows: &[usize],
240-
equal_to_results: &mut Vec<bool>| {
242+
equal_to_results: &mut EqualToResults| {
241243
builder.vectorized_equal_to(
242244
lhs_rows,
243245
input_array,
@@ -257,7 +259,7 @@ mod tests {
257259
&[usize],
258260
&ArrayRef,
259261
&[usize],
260-
&mut Vec<bool>,
262+
&mut EqualToResults,
261263
),
262264
{
263265
// Will cover such cases:
@@ -302,21 +304,23 @@ mod tests {
302304
let input_array = Arc::new(BooleanArray::new(values, nulls.finish())) as ArrayRef;
303305

304306
// Check
305-
let mut equal_to_results = vec![true; builder.len()];
307+
let mut equal_to_results = EqualToResults::new();
308+
equal_to_results.reset(builder.len());
306309
equal_to(
307310
&builder,
308311
&[0, 1, 2, 3, 4, 5],
309312
&input_array,
310313
&[0, 1, 2, 3, 4, 5],
311314
&mut equal_to_results,
312315
);
313-
314-
assert!(!equal_to_results[0]);
315-
assert!(equal_to_results[1]);
316-
assert!(equal_to_results[2]);
317-
assert!(!equal_to_results[3]);
318-
assert!(!equal_to_results[4]);
319-
assert!(equal_to_results[5]);
316+
let results = equal_to_results.to_vec();
317+
318+
assert!(!results[0]);
319+
assert!(results[1]);
320+
assert!(results[2]);
321+
assert!(!results[3]);
322+
assert!(!results[4]);
323+
assert!(results[5]);
320324
}
321325

322326
#[test]
@@ -333,10 +337,10 @@ mod tests {
333337
lhs_rows: &[usize],
334338
input_array: &ArrayRef,
335339
rhs_rows: &[usize],
336-
equal_to_results: &mut Vec<bool>| {
340+
equal_to_results: &mut EqualToResults| {
337341
let iter = lhs_rows.iter().zip(rhs_rows.iter());
338342
for (idx, (&lhs_row, &rhs_row)) in iter.enumerate() {
339-
equal_to_results[idx] = builder.equal_to(lhs_row, input_array, rhs_row);
343+
equal_to_results.set_bit(idx, builder.equal_to(lhs_row, input_array, rhs_row));
340344
}
341345
};
342346

@@ -357,7 +361,7 @@ mod tests {
357361
lhs_rows: &[usize],
358362
input_array: &ArrayRef,
359363
rhs_rows: &[usize],
360-
equal_to_results: &mut Vec<bool>| {
364+
equal_to_results: &mut EqualToResults| {
361365
builder.vectorized_equal_to(
362366
lhs_rows,
363367
input_array,
@@ -377,7 +381,7 @@ mod tests {
377381
&[usize],
378382
&ArrayRef,
379383
&[usize],
380-
&mut Vec<bool>,
384+
&mut EqualToResults,
381385
),
382386
{
383387
// Will cover such cases:
@@ -403,19 +407,21 @@ mod tests {
403407
])) as ArrayRef;
404408

405409
// Check
406-
let mut equal_to_results = vec![true; builder.len()];
410+
let mut equal_to_results = EqualToResults::new();
411+
equal_to_results.reset(builder.len());
407412
equal_to(
408413
&builder,
409414
&[0, 1, 2, 3],
410415
&input_array,
411416
&[0, 1, 2, 3],
412417
&mut equal_to_results,
413418
);
419+
let results = equal_to_results.to_vec();
414420

415-
assert!(equal_to_results[0]);
416-
assert!(!equal_to_results[1]);
417-
assert!(!equal_to_results[2]);
418-
assert!(equal_to_results[3]);
421+
assert!(results[0]);
422+
assert!(!results[1]);
423+
assert!(!results[2]);
424+
assert!(results[3]);
419425
}
420426

421427
#[test]
@@ -432,19 +438,21 @@ mod tests {
432438
.vectorized_append(&all_nulls_input_array, &[0, 1, 2, 3, 4])
433439
.unwrap();
434440

435-
let mut equal_to_results = vec![true; all_nulls_input_array.len()];
441+
let mut equal_to_results = EqualToResults::new();
442+
equal_to_results.reset(all_nulls_input_array.len());
436443
builder.vectorized_equal_to(
437444
&[0, 1, 2, 3, 4],
438445
&all_nulls_input_array,
439446
&[0, 1, 2, 3, 4],
440447
&mut equal_to_results,
441448
);
449+
let results = equal_to_results.to_vec();
442450

443-
assert!(equal_to_results[0]);
444-
assert!(equal_to_results[1]);
445-
assert!(equal_to_results[2]);
446-
assert!(equal_to_results[3]);
447-
assert!(equal_to_results[4]);
451+
assert!(results[0]);
452+
assert!(results[1]);
453+
assert!(results[2]);
454+
assert!(results[3]);
455+
assert!(results[4]);
448456

449457
// All not nulls input array
450458
let all_not_nulls_input_array = Arc::new(BooleanArray::from(vec![
@@ -458,18 +466,20 @@ mod tests {
458466
.vectorized_append(&all_not_nulls_input_array, &[0, 1, 2, 3, 4])
459467
.unwrap();
460468

461-
let mut equal_to_results = vec![true; all_not_nulls_input_array.len()];
469+
let mut equal_to_results = EqualToResults::new();
470+
equal_to_results.reset(all_not_nulls_input_array.len());
462471
builder.vectorized_equal_to(
463472
&[5, 6, 7, 8, 9],
464473
&all_not_nulls_input_array,
465474
&[0, 1, 2, 3, 4],
466475
&mut equal_to_results,
467476
);
477+
let results = equal_to_results.to_vec();
468478

469-
assert!(equal_to_results[0]);
470-
assert!(equal_to_results[1]);
471-
assert!(equal_to_results[2]);
472-
assert!(equal_to_results[3]);
473-
assert!(equal_to_results[4]);
479+
assert!(results[0]);
480+
assert!(results[1]);
481+
assert!(results[2]);
482+
assert!(results[3]);
483+
assert!(results[4]);
474484
}
475485
}

0 commit comments

Comments
 (0)