Skip to content

Commit b320233

Browse files
committed
bitmap_smaller_datatypes
1 parent 7601623 commit b320233

1 file changed

Lines changed: 100 additions & 1 deletion

File tree

datafusion/functions-aggregate/benches/approx_distinct.rs

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
use std::sync::Arc;
1919

20-
use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
20+
use arrow::array::{
21+
ArrayRef, BooleanArray, Int8Array, Int16Array, Int64Array, StringArray,
22+
StringViewArray, UInt8Array, UInt16Array,
23+
};
2124
use arrow::datatypes::{DataType, Field, Schema};
2225
use criterion::{Criterion, criterion_group, criterion_main};
2326
use datafusion_expr::function::AccumulatorArgs;
@@ -56,6 +59,45 @@ fn create_i64_array(n_distinct: usize) -> Int64Array {
5659
.collect()
5760
}
5861

62+
fn create_u8_array(n_distinct: usize) -> UInt8Array {
63+
let mut rng = StdRng::seed_from_u64(42);
64+
let max_val = n_distinct.min(256) as u8;
65+
(0..BATCH_SIZE)
66+
.map(|_| Some(rng.random_range(0..max_val)))
67+
.collect()
68+
}
69+
70+
fn create_i8_array(n_distinct: usize) -> Int8Array {
71+
let mut rng = StdRng::seed_from_u64(42);
72+
let max_val = (n_distinct.min(256) / 2) as i8;
73+
(0..BATCH_SIZE)
74+
.map(|_| Some(rng.random_range(-max_val..max_val)))
75+
.collect()
76+
}
77+
78+
fn create_u16_array(n_distinct: usize) -> UInt16Array {
79+
let mut rng = StdRng::seed_from_u64(42);
80+
let max_val = n_distinct.min(65536) as u16;
81+
(0..BATCH_SIZE)
82+
.map(|_| Some(rng.random_range(0..max_val)))
83+
.collect()
84+
}
85+
86+
fn create_i16_array(n_distinct: usize) -> Int16Array {
87+
let mut rng = StdRng::seed_from_u64(42);
88+
let max_val = (n_distinct.min(65536) / 2) as i16;
89+
(0..BATCH_SIZE)
90+
.map(|_| Some(rng.random_range(-max_val..max_val)))
91+
.collect()
92+
}
93+
94+
fn create_bool_array() -> BooleanArray {
95+
let mut rng = StdRng::seed_from_u64(42);
96+
(0..BATCH_SIZE)
97+
.map(|_| Some(rng.random_bool(0.5)))
98+
.collect()
99+
}
100+
59101
/// Creates a pool of `n_distinct` random strings of the given length.
60102
fn create_string_pool(n_distinct: usize, string_length: usize) -> Vec<String> {
61103
let mut rng = StdRng::seed_from_u64(42);
@@ -133,6 +175,63 @@ fn approx_distinct_benchmark(c: &mut Criterion) {
133175
);
134176
}
135177
}
178+
179+
// --- Benchmarks for the small types (UInt8, Int8, UInt16, Int16, Boolean) targeted by the bitmap optimization ---
180+
181+
// UInt8
182+
let values = Arc::new(create_u8_array(200)) as ArrayRef;
183+
c.bench_function("approx_distinct u8 bitmap", |b| {
184+
b.iter(|| {
185+
let mut accumulator = prepare_accumulator(DataType::UInt8);
186+
accumulator
187+
.update_batch(std::slice::from_ref(&values))
188+
.unwrap()
189+
})
190+
});
191+
192+
// Int8
193+
let values = Arc::new(create_i8_array(200)) as ArrayRef;
194+
c.bench_function("approx_distinct i8 bitmap", |b| {
195+
b.iter(|| {
196+
let mut accumulator = prepare_accumulator(DataType::Int8);
197+
accumulator
198+
.update_batch(std::slice::from_ref(&values))
199+
.unwrap()
200+
})
201+
});
202+
203+
// UInt16
204+
let values = Arc::new(create_u16_array(50000)) as ArrayRef;
205+
c.bench_function("approx_distinct u16 bitmap", |b| {
206+
b.iter(|| {
207+
let mut accumulator = prepare_accumulator(DataType::UInt16);
208+
accumulator
209+
.update_batch(std::slice::from_ref(&values))
210+
.unwrap()
211+
})
212+
});
213+
214+
// Int16
215+
let values = Arc::new(create_i16_array(50000)) as ArrayRef;
216+
c.bench_function("approx_distinct i16 bitmap", |b| {
217+
b.iter(|| {
218+
let mut accumulator = prepare_accumulator(DataType::Int16);
219+
accumulator
220+
.update_batch(std::slice::from_ref(&values))
221+
.unwrap()
222+
})
223+
});
224+
225+
// Boolean
226+
let values = Arc::new(create_bool_array()) as ArrayRef;
227+
c.bench_function("approx_distinct bool bitmap", |b| {
228+
b.iter(|| {
229+
let mut accumulator = prepare_accumulator(DataType::Boolean);
230+
accumulator
231+
.update_batch(std::slice::from_ref(&values))
232+
.unwrap()
233+
})
234+
});
136235
}
137236

138237
criterion_group!(benches, approx_distinct_benchmark);

0 commit comments

Comments
 (0)