Skip to content

Commit ddfc282

Browse files
authored
Add tests for sqllogictest prioritization (#20656)
## Which issue does this PR close? - Follow on to #20576 ## Rationale for this change Implement suggestions from @kosiew in #20576 (comment): > Can we add a deterministic tie-breaker in sort_tests (for equal priority) using relative_path, e.g. sort_by_key(|f| (priority, f.relative_path.clone())) to keep run order stable? > > This would also benefit from a small unit test covering: > > prioritized files are first, > non-prioritized files keep deterministic ordering ## What changes are included in this PR? 1. Add deterministic tie breaker so tests are always run in the same order 2. Add a test for the ordering Note that in order to actually write these tests I needed to pull the TestFile and some related logic into its own module ## Are these changes tested? Yes by CI I also ran the tests locally via ```shell cargo test --profile=ci --test sqllogictests ``` I also double checked sqllogictests ```shell INCLUDE_SQLITE=true cargo test --profile release-nonlto --test sqllogictests ``` ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent cbe5cb3 commit ddfc282

3 files changed

Lines changed: 213 additions & 131 deletions

File tree

datafusion/sqllogictest/bin/sqllogictests.rs

Lines changed: 25 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@
1818
use clap::{ColorChoice, Parser, ValueEnum};
1919
use datafusion::common::instant::Instant;
2020
use datafusion::common::utils::get_available_parallelism;
21-
use datafusion::common::{
22-
DataFusionError, HashMap, Result, exec_datafusion_err, exec_err,
23-
};
21+
use datafusion::common::{DataFusionError, Result, exec_datafusion_err, exec_err};
22+
use datafusion_sqllogictest::TestFile;
2423
use datafusion_sqllogictest::{
2524
CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip, Filter,
2625
TestContext, df_value_validator, read_dir_recursive, setup_scratch_dir,
@@ -44,13 +43,12 @@ use crate::postgres_container::{
4443
};
4544
use datafusion::common::runtime::SpawnedTask;
4645
use futures::FutureExt;
47-
use std::ffi::OsStr;
4846
use std::fs;
4947
use std::io::{IsTerminal, stderr, stdout};
5048
use std::path::{Path, PathBuf};
5149
use std::str::FromStr;
50+
use std::sync::Arc;
5251
use std::sync::atomic::{AtomicUsize, Ordering};
53-
use std::sync::{Arc, LazyLock};
5452
use std::time::Duration;
5553

5654
#[cfg(feature = "postgres")]
@@ -59,6 +57,7 @@ mod postgres_container;
5957
const TEST_DIRECTORY: &str = "test_files/";
6058
const DATAFUSION_TESTING_TEST_DIRECTORY: &str = "../../datafusion-testing/data/";
6159
const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_";
60+
const TPCH_PREFIX: &str = "tpch";
6261
const SQLITE_PREFIX: &str = "sqlite";
6362
const ERRS_PER_FILE_LIMIT: usize = 10;
6463
const TIMING_DEBUG_SLOW_FILES_ENV: &str = "SLT_TIMING_DEBUG_SLOW_FILES";
@@ -77,55 +76,6 @@ struct FileTiming {
7776
elapsed: Duration,
7877
}
7978

80-
/// TEST PRIORITY
81-
///
82-
/// Heuristically prioritize some test to run earlier.
83-
///
84-
/// Prioritizes test to run earlier if they are known to be long running (as
85-
/// each test file itself is run sequentially, but multiple test files are run
86-
/// in parallel.
87-
///
88-
/// Tests not listed here will run after the listed tests in an arbitrary order.
89-
///
90-
/// You can find the top longest running tests by running `--timing-summary` mode.
91-
/// For example
92-
///
93-
/// ```shell
94-
/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
95-
/// ...
96-
/// Per-file elapsed summary (deterministic):
97-
/// 1. 5.375s push_down_filter_regression.slt
98-
/// 2. 3.174s aggregate.slt
99-
/// 3. 3.158s imdb.slt
100-
/// 4. 2.793s joins.slt
101-
/// 5. 2.505s array.slt
102-
/// 6. 2.265s aggregate_skip_partial.slt
103-
/// 7. 2.260s window.slt
104-
/// 8. 1.677s group_by.slt
105-
/// 9. 0.973s datetime/timestamps.slt
106-
/// 10. 0.822s cte.slt
107-
/// ```
108-
static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
109-
[
110-
(PathBuf::from("push_down_filter_regression.slt"), 0), // longest running, so run first.
111-
(PathBuf::from("aggregate.slt"), 1),
112-
(PathBuf::from("joins.slt"), 2),
113-
(PathBuf::from("imdb.slt"), 3),
114-
(PathBuf::from("array.slt"), 4),
115-
(PathBuf::from("aggregate_skip_partial.slt"), 5),
116-
(PathBuf::from("window.slt"), 6),
117-
(PathBuf::from("group_by.slt"), 7),
118-
(PathBuf::from("datetime/timestamps.slt"), 8),
119-
(PathBuf::from("cte.slt"), 9),
120-
]
121-
.into_iter()
122-
.collect()
123-
});
124-
125-
/// Default priority for tests not in the TEST_PRIORITY map. Tests with lower
126-
/// priority values run first.
127-
static DEFAULT_PRIORITY: usize = 100;
128-
12979
pub fn main() -> Result<()> {
13080
tokio::runtime::Builder::new_multi_thread()
13181
.enable_all()
@@ -832,91 +782,35 @@ async fn run_complete_file_with_postgres(
832782
plan_err!("Can not run with postgres as postgres feature is not enabled")
833783
}
834784

835-
/// Represents a parsed test file
836-
#[derive(Debug)]
837-
struct TestFile {
838-
/// The absolute path to the file
839-
pub path: PathBuf,
840-
/// The relative path of the file (used for display)
841-
pub relative_path: PathBuf,
842-
}
843-
844-
impl TestFile {
845-
fn new(path: PathBuf) -> Self {
846-
let p = path.to_string_lossy();
847-
let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) {
848-
p.strip_prefix(TEST_DIRECTORY).unwrap()
849-
} else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) {
850-
p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap()
851-
} else {
852-
""
853-
});
854-
855-
Self {
856-
path,
857-
relative_path,
858-
}
859-
}
860-
861-
fn is_slt_file(&self) -> bool {
862-
self.path.extension() == Some(OsStr::new("slt"))
863-
}
864-
865-
fn check_sqlite(&self, options: &Options) -> bool {
866-
if !self.relative_path.starts_with(SQLITE_PREFIX) {
867-
return true;
868-
}
869-
870-
options.include_sqlite
871-
}
872-
873-
fn check_tpch(&self, options: &Options) -> bool {
874-
if !self.relative_path.starts_with("tpch") {
875-
return true;
876-
}
785+
fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
786+
let prefixes: &[&str] = if options.include_sqlite {
787+
&[TEST_DIRECTORY, DATAFUSION_TESTING_TEST_DIRECTORY]
788+
} else {
789+
&[TEST_DIRECTORY]
790+
};
877791

878-
options.include_tpch
879-
}
880-
}
792+
let directories = prefixes
793+
.iter()
794+
.map(|prefix| {
795+
read_dir_recursive(prefix).map_err(|e| {
796+
exec_datafusion_err!("Error reading test directory {prefix}: {e}")
797+
})
798+
})
799+
.collect::<Result<Vec<_>>>()?;
881800

882-
fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
883-
let mut paths = read_dir_recursive(TEST_DIRECTORY)?
801+
let mut paths = directories
884802
.into_iter()
885-
.map(TestFile::new)
803+
.flatten()
804+
.map(|p| TestFile::new(p, prefixes))
886805
.filter(|f| options.check_test_file(&f.path))
887806
.filter(|f| f.is_slt_file())
888-
.filter(|f| f.check_tpch(options))
889-
.filter(|f| f.check_sqlite(options))
807+
.filter(|f| !f.relative_path_starts_with(TPCH_PREFIX) || options.include_tpch)
808+
.filter(|f| !f.relative_path_starts_with(SQLITE_PREFIX) || options.include_sqlite)
890809
.filter(|f| options.check_pg_compat_file(f.path.as_path()))
891810
.collect::<Vec<_>>();
892-
if options.include_sqlite {
893-
let mut sqlite_paths = read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)?
894-
.into_iter()
895-
.map(TestFile::new)
896-
.filter(|f| options.check_test_file(&f.path))
897-
.filter(|f| f.is_slt_file())
898-
.filter(|f| f.check_sqlite(options))
899-
.filter(|f| options.check_pg_compat_file(f.path.as_path()))
900-
.collect::<Vec<_>>();
901-
902-
paths.append(&mut sqlite_paths)
903-
}
904811

905-
Ok(sort_tests(paths))
906-
}
907-
908-
/// Sort the tests heuristically by order of "priority"
909-
///
910-
/// Prioritizes test to run earlier if they are known to be long running (as
911-
/// each test file itself is run sequentially, but multiple test files are run
912-
/// in parallel.
913-
fn sort_tests(mut tests: Vec<TestFile>) -> Vec<TestFile> {
914-
tests.sort_by_key(|f| {
915-
TEST_PRIORITY
916-
.get(&f.relative_path)
917-
.unwrap_or(&DEFAULT_PRIORITY)
918-
});
919-
tests
812+
paths.sort_unstable();
813+
Ok(paths)
920814
}
921815

922816
/// Parsed command line options

datafusion/sqllogictest/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
//! DataFusion sqllogictest driver
2828
2929
mod engines;
30+
mod test_file;
3031

3132
pub use engines::CurrentlyExecutingSqlTracker;
3233
pub use engines::DFColumnType;
@@ -46,4 +47,5 @@ mod util;
4647

4748
pub use filters::*;
4849
pub use test_context::TestContext;
50+
pub use test_file::TestFile;
4951
pub use util::*;

0 commit comments

Comments
 (0)