Skip to content

Commit 8367a75

Browse files
committed
create_benches_small_int
1 parent 8d91fb0 commit 8367a75

3 files changed

Lines changed: 240 additions & 1 deletion

File tree

datafusion/functions-aggregate/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,7 @@ harness = false
8383
[[bench]]
8484
name = "first_last"
8585
harness = false
86+
87+
[[bench]]
88+
name = "count_distinct"
89+
harness = false

datafusion/functions-aggregate/benches/approx_distinct.rs

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
use std::sync::Arc;
1919

20-
use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
20+
use arrow::array::{
21+
ArrayRef, Int8Array, Int16Array, Int64Array, StringArray, StringViewArray,
22+
UInt8Array, UInt16Array,
23+
};
2124
use arrow::datatypes::{DataType, Field, Schema};
2225
use criterion::{Criterion, criterion_group, criterion_main};
2326
use datafusion_expr::function::AccumulatorArgs;
@@ -56,6 +59,38 @@ fn create_i64_array(n_distinct: usize) -> Int64Array {
5659
.collect()
5760
}
5861

62+
fn create_u8_array(n_distinct: usize) -> UInt8Array {
63+
let mut rng = StdRng::seed_from_u64(42);
64+
let max_val = n_distinct.min(256) as u8;
65+
(0..BATCH_SIZE)
66+
.map(|_| Some(rng.random_range(0..max_val)))
67+
.collect()
68+
}
69+
70+
fn create_i8_array(n_distinct: usize) -> Int8Array {
71+
let mut rng = StdRng::seed_from_u64(42);
72+
let max_val = (n_distinct.min(256) / 2) as i8;
73+
(0..BATCH_SIZE)
74+
.map(|_| Some(rng.random_range(-max_val..max_val)))
75+
.collect()
76+
}
77+
78+
fn create_u16_array(n_distinct: usize) -> UInt16Array {
79+
let mut rng = StdRng::seed_from_u64(42);
80+
let max_val = n_distinct.min(65536) as u16;
81+
(0..BATCH_SIZE)
82+
.map(|_| Some(rng.random_range(0..max_val)))
83+
.collect()
84+
}
85+
86+
fn create_i16_array(n_distinct: usize) -> Int16Array {
87+
let mut rng = StdRng::seed_from_u64(42);
88+
let max_val = (n_distinct.min(65536) / 2) as i16;
89+
(0..BATCH_SIZE)
90+
.map(|_| Some(rng.random_range(-max_val..max_val)))
91+
.collect()
92+
}
93+
5994
/// Creates a pool of `n_distinct` random strings of the given length.
6095
fn create_string_pool(n_distinct: usize, string_length: usize) -> Vec<String> {
6196
let mut rng = StdRng::seed_from_u64(42);
@@ -133,6 +168,52 @@ fn approx_distinct_benchmark(c: &mut Criterion) {
133168
);
134169
}
135170
}
171+
172+
// Small integer types
173+
174+
// UInt8
175+
let values = Arc::new(create_u8_array(200)) as ArrayRef;
176+
c.bench_function("approx_distinct u8 bitmap", |b| {
177+
b.iter(|| {
178+
let mut accumulator = prepare_accumulator(DataType::UInt8);
179+
accumulator
180+
.update_batch(std::slice::from_ref(&values))
181+
.unwrap()
182+
})
183+
});
184+
185+
// Int8
186+
let values = Arc::new(create_i8_array(200)) as ArrayRef;
187+
c.bench_function("approx_distinct i8 bitmap", |b| {
188+
b.iter(|| {
189+
let mut accumulator = prepare_accumulator(DataType::Int8);
190+
accumulator
191+
.update_batch(std::slice::from_ref(&values))
192+
.unwrap()
193+
})
194+
});
195+
196+
// UInt16
197+
let values = Arc::new(create_u16_array(50000)) as ArrayRef;
198+
c.bench_function("approx_distinct u16 bitmap", |b| {
199+
b.iter(|| {
200+
let mut accumulator = prepare_accumulator(DataType::UInt16);
201+
accumulator
202+
.update_batch(std::slice::from_ref(&values))
203+
.unwrap()
204+
})
205+
});
206+
207+
// Int16
208+
let values = Arc::new(create_i16_array(50000)) as ArrayRef;
209+
c.bench_function("approx_distinct i16 bitmap", |b| {
210+
b.iter(|| {
211+
let mut accumulator = prepare_accumulator(DataType::Int16);
212+
accumulator
213+
.update_batch(std::slice::from_ref(&values))
214+
.unwrap()
215+
})
216+
});
136217
}
137218

138219
// Bundle `approx_distinct_benchmark` into the Criterion group named `benches`.
criterion_group!(benches, approx_distinct_benchmark);
datafusion/functions-aggregate/benches/count_distinct.rs

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::sync::Arc;
19+
20+
use arrow::array::{
21+
ArrayRef, Int8Array, Int16Array, Int64Array, UInt8Array, UInt16Array,
22+
};
23+
use arrow::datatypes::{DataType, Field, Schema};
24+
use criterion::{Criterion, criterion_group, criterion_main};
25+
use datafusion_expr::function::AccumulatorArgs;
26+
use datafusion_expr::{Accumulator, AggregateUDFImpl};
27+
use datafusion_functions_aggregate::count::Count;
28+
use datafusion_physical_expr::expressions::col;
29+
use rand::rngs::StdRng;
30+
use rand::{Rng, SeedableRng};
31+
32+
const BATCH_SIZE: usize = 8192;
33+
34+
fn prepare_accumulator(data_type: DataType) -> Box<dyn Accumulator> {
35+
let schema = Arc::new(Schema::new(vec![Field::new("f", data_type, true)]));
36+
let expr = col("f", &schema).unwrap();
37+
let accumulator_args = AccumulatorArgs {
38+
return_field: Field::new("f", DataType::Int64, true).into(),
39+
schema: &schema,
40+
expr_fields: &[expr.return_field(&schema).unwrap()],
41+
ignore_nulls: false,
42+
order_bys: &[],
43+
is_reversed: false,
44+
name: "count(distinct f)",
45+
is_distinct: true,
46+
exprs: &[expr],
47+
};
48+
Count::new().accumulator(accumulator_args).unwrap()
49+
}
50+
51+
fn create_i64_array(n_distinct: usize) -> Int64Array {
52+
let mut rng = StdRng::seed_from_u64(42);
53+
(0..BATCH_SIZE)
54+
.map(|_| Some(rng.random_range(0..n_distinct as i64)))
55+
.collect()
56+
}
57+
58+
fn create_u8_array(n_distinct: usize) -> UInt8Array {
59+
let mut rng = StdRng::seed_from_u64(42);
60+
let max_val = n_distinct.min(256) as u8;
61+
(0..BATCH_SIZE)
62+
.map(|_| Some(rng.random_range(0..max_val)))
63+
.collect()
64+
}
65+
66+
fn create_i8_array(n_distinct: usize) -> Int8Array {
67+
let mut rng = StdRng::seed_from_u64(42);
68+
let max_val = (n_distinct.min(256) / 2) as i8;
69+
(0..BATCH_SIZE)
70+
.map(|_| Some(rng.random_range(-max_val..max_val)))
71+
.collect()
72+
}
73+
74+
fn create_u16_array(n_distinct: usize) -> UInt16Array {
75+
let mut rng = StdRng::seed_from_u64(42);
76+
let max_val = n_distinct.min(65536) as u16;
77+
(0..BATCH_SIZE)
78+
.map(|_| Some(rng.random_range(0..max_val)))
79+
.collect()
80+
}
81+
82+
fn create_i16_array(n_distinct: usize) -> Int16Array {
83+
let mut rng = StdRng::seed_from_u64(42);
84+
let max_val = (n_distinct.min(65536) / 2) as i16;
85+
(0..BATCH_SIZE)
86+
.map(|_| Some(rng.random_range(-max_val..max_val)))
87+
.collect()
88+
}
89+
90+
fn count_distinct_benchmark(c: &mut Criterion) {
91+
for pct in [80, 99] {
92+
let n_distinct = BATCH_SIZE * pct / 100;
93+
94+
// Int64
95+
let values = Arc::new(create_i64_array(n_distinct)) as ArrayRef;
96+
c.bench_function(&format!("count_distinct i64 {pct}% distinct"), |b| {
97+
b.iter(|| {
98+
let mut accumulator = prepare_accumulator(DataType::Int64);
99+
accumulator
100+
.update_batch(std::slice::from_ref(&values))
101+
.unwrap()
102+
})
103+
});
104+
}
105+
106+
// Small integer types
107+
108+
// UInt8
109+
let values = Arc::new(create_u8_array(200)) as ArrayRef;
110+
c.bench_function("count_distinct u8 bitmap", |b| {
111+
b.iter(|| {
112+
let mut accumulator = prepare_accumulator(DataType::UInt8);
113+
accumulator
114+
.update_batch(std::slice::from_ref(&values))
115+
.unwrap()
116+
})
117+
});
118+
119+
// Int8
120+
let values = Arc::new(create_i8_array(200)) as ArrayRef;
121+
c.bench_function("count_distinct i8 bitmap", |b| {
122+
b.iter(|| {
123+
let mut accumulator = prepare_accumulator(DataType::Int8);
124+
accumulator
125+
.update_batch(std::slice::from_ref(&values))
126+
.unwrap()
127+
})
128+
});
129+
130+
// UInt16
131+
let values = Arc::new(create_u16_array(50000)) as ArrayRef;
132+
c.bench_function("count_distinct u16 bitmap", |b| {
133+
b.iter(|| {
134+
let mut accumulator = prepare_accumulator(DataType::UInt16);
135+
accumulator
136+
.update_batch(std::slice::from_ref(&values))
137+
.unwrap()
138+
})
139+
});
140+
141+
// Int16
142+
let values = Arc::new(create_i16_array(50000)) as ArrayRef;
143+
c.bench_function("count_distinct i16 bitmap", |b| {
144+
b.iter(|| {
145+
let mut accumulator = prepare_accumulator(DataType::Int16);
146+
accumulator
147+
.update_batch(std::slice::from_ref(&values))
148+
.unwrap()
149+
})
150+
});
151+
}
152+
153+
// Bundle `count_distinct_benchmark` into the Criterion group named `benches`.
criterion_group!(benches, count_distinct_benchmark);
154+
// Generate the benchmark binary's entry point for the `benches` group.
criterion_main!(benches);

0 commit comments

Comments
 (0)