diff --git a/Cargo.lock b/Cargo.lock index af52588e5338e..8a2b06e0f351d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -112,7 +112,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -123,7 +123,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1736,6 +1736,7 @@ dependencies = [ name = "datafusion-benchmarks" version = "53.1.0" dependencies = [ + "anstream", "arrow", "async-trait", "bytes", @@ -1759,6 +1760,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util", + "toml", ] [[package]] @@ -2712,7 +2714,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2856,7 +2858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4118,7 +4120,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -4867,7 +4869,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -5248,7 +5250,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -5511,6 +5513,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_spanned" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" +dependencies = [ + "serde_core", +] + [[package]] name = "serde_tokenstream" version = "0.2.3" @@ -5710,7 +5721,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5811,7 +5822,7 @@ dependencies = [ "cfg-if", "libc", "psm", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5981,7 +5992,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -6239,6 +6250,21 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" +dependencies = [ + "indexmap 2.14.0", + "serde_core", + "serde_spanned", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow", +] + [[package]] name = "toml_datetime" version = "1.1.1+spec-1.1.0" @@ -6269,6 +6295,12 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_writer" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" + [[package]] name = "tonic" version = "0.14.5" @@ -6899,7 +6931,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 1815f8bc42ca3..97e0d901b95f9 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -40,10 +40,11 @@ snmalloc = ["snmalloc-rs"] mimalloc_extended = ["libmimalloc-sys/extended"] [dependencies] +anstream = "1.0" arrow = { workspace = true } async-trait = "0.1" bytes = { workspace = true } -clap = { version = "4.6.0", features = ["derive", "env"] } +clap = { version = "4.6.1", features = ["derive", "env", "color"] } criterion = { workspace = true, features = ["html_reports"] } datafusion = { workspace = true, default-features = true } datafusion-common = { workspace = true, default-features = true } @@ -61,6 +62,7 @@ serde_json = { workspace = true } snmalloc-rs = { version = "0.7", optional = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tokio-util = { version = "0.7.17" } +toml = "1.1" [dev-dependencies] datafusion-proto = { workspace = true } diff --git a/benchmarks/sql_benchmarks/tpch/tpch.suite b/benchmarks/sql_benchmarks/tpch/tpch.suite new file mode 100644 index 0000000000000..317b32e57f45f --- /dev/null +++ b/benchmarks/sql_benchmarks/tpch/tpch.suite @@ -0,0 +1,16 @@ +name = "tpch" +description = "TPC-H SQL benchmarks" + +[[options]] +name = "format" +short = "f" +default = "parquet" +values = ["parquet", "csv", "mem"] +help = "Selects the TPC-H data format." + +[[options]] +name = "scale-factor" +short = "sf" +default = "1" +values = ["1", "10"] +help = "Selects the TPC-H scale factor." diff --git a/benchmarks/src/benchmark_runner/cli.rs b/benchmarks/src/benchmark_runner/cli.rs new file mode 100644 index 0000000000000..30fb71b5a30cc --- /dev/null +++ b/benchmarks/src/benchmark_runner/cli.rs @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! CLI construction and argument conversion for `benchmark_runner`. +//! +//! This module owns the clap command tree for the initial runner surface: +//! top-level help and suite listing. + +use clap::builder::styling::{AnsiColor, Styles}; +use clap::{ArgMatches, Command}; +use datafusion_common::{Result, exec_datafusion_err}; + +const HELP_STYLES: Styles = Styles::styled() + .header(AnsiColor::Green.on_default().bold()) + .usage(AnsiColor::Green.on_default().bold()) + .literal(AnsiColor::Cyan.on_default().bold()) + .placeholder(AnsiColor::Cyan.on_default()); + +#[derive(Debug)] +pub enum RunnerCommand { + Help, + List, +} + +/// Builds the command tree for help and suite listing. +pub fn build_cli() -> Command { + Command::new("benchmark_runner") + .about("Inspect DataFusion SQL benchmark suites.") + .styles(HELP_STYLES) + .subcommand_required(false) + .arg_required_else_help(false) + .disable_help_subcommand(true) + .subcommand(Command::new("help").about("Print help")) + .subcommand(Command::new("list").about("List SQL benchmark suites")) +} + +/// Converts clap matches into a typed command. +pub(crate) fn command_from_matches(matches: &ArgMatches) -> Result { + match matches.subcommand() { + None | Some(("help", _)) => Ok(RunnerCommand::Help), + Some(("list", _)) => Ok(RunnerCommand::List), + Some((name, _)) => Err(exec_datafusion_err!("Unknown command '{name}'")), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn list_rejects_unrecognized_options() { + let matches = + build_cli().try_get_matches_from(["benchmark_runner", "list", "--format"]); + + assert!(matches.is_err(), "{matches:?}"); + } + + #[test] + fn help_mentions_list_command() { + let err = build_cli() + .try_get_matches_from(["benchmark_runner", "--help"]) + .unwrap_err(); + let help = err.to_string(); + + assert!(help.contains("list")); + } +} diff --git a/benchmarks/src/benchmark_runner/mod.rs b/benchmarks/src/benchmark_runner/mod.rs new file mode 100644 index 0000000000000..458e5f974c152 --- /dev/null +++ b/benchmarks/src/benchmark_runner/mod.rs @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Command-line inspection for SQL benchmark suites. +//! +//! This module backs the `benchmark_runner` binary. The initial command +//! surface lists discovered SQL benchmark suites from `.suite` files and +//! prints the top-level help. +//! +//! Common invocations: +//! +//! ```text +//! cargo run --bin benchmark_runner -- --help +//! cargo run --release --bin benchmark_runner -- list +//! ``` +//! +//! The public entry point is [`run_cli`]. The submodules are kept private so +//! the command-line flow remains the single supported API: +//! +//! - `cli` builds the clap command tree and parses the selected command. +//! - `suite` loads `.suite` metadata and discovers benchmark query files. +//! - `output` formats colored `list` command output. + +mod cli; +mod output; +mod suite; + +use crate::benchmark_runner::cli::{RunnerCommand, build_cli, command_from_matches}; +use crate::benchmark_runner::output::format_suite_list_styled; +use crate::benchmark_runner::suite::SuiteRegistry; +use datafusion::error::Result; +use datafusion_common::DataFusionError; +use std::io::Write; +use std::path::PathBuf; + +/// Runs the benchmark runner command-line flow for the provided argument list. +/// +/// This discovers suite metadata, parses the help/list command, and dispatches +/// to the selected implementation. +pub fn run_cli(args: I) -> Result<()> +where + I: IntoIterator, + T: Clone + Into, +{ + let benchmark_dir = default_benchmark_dir(); + let registry = SuiteRegistry::discover(&benchmark_dir)?; + let mut cli = build_cli(); + let matches = match cli.try_get_matches_from_mut(args) { + Ok(matches) => matches, + Err(e) if e.kind() == clap::error::ErrorKind::DisplayHelp => { + e.print()?; + return Ok(()); + } + Err(e) => return Err(DataFusionError::External(Box::new(e))), + }; + let command = command_from_matches(&matches)?; + + match command { + RunnerCommand::Help => { + cli.print_long_help()?; + println!(); + } + RunnerCommand::List => { + print_styled(&format_suite_list_styled(®istry)?)?; + } + } + + Ok(()) +} + +/// Writes already styled output through `anstream` so ANSI color handling +/// matches clap help output on supported terminals. +fn print_styled(output: &str) -> Result<()> { + let mut stdout = anstream::stdout(); + + write!(&mut stdout, "{output}") + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(()) +} + +/// Resolves the SQL benchmark root from either the repository root or the +/// benchmarks crate manifest directory. +fn default_benchmark_dir() -> PathBuf { + let repo_root_path = PathBuf::from("benchmarks/sql_benchmarks"); + if repo_root_path.exists() { + repo_root_path + } else { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("sql_benchmarks") + } +} diff --git a/benchmarks/src/benchmark_runner/output.rs b/benchmarks/src/benchmark_runner/output.rs new file mode 100644 index 0000000000000..9eb975311e29f --- /dev/null +++ b/benchmarks/src/benchmark_runner/output.rs @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Formatting helpers for human-readable benchmark runner output. +//! +//! The runner intentionally uses the same colored style for `list` output as +//! clap uses for help text. + +use crate::benchmark_runner::suite::{SuiteConfig, SuiteOption, SuiteRegistry}; +use clap::builder::styling::{AnsiColor, Style}; +use datafusion_common::Result; +use std::fmt::{Display, Write as _}; + +/// Formats the `list` command output with suite summaries, query-id hints, and +/// configurable suite options. +pub fn format_suite_list_styled(registry: &SuiteRegistry) -> Result { + let mut output = String::new(); + + for suite in registry.suites() { + write_suite_list_entry(&mut output, suite)?; + } + + Ok(output) +} + +/// Writes one suite entry for the `list` command. +fn write_suite_list_entry(output: &mut String, suite: &SuiteConfig) -> Result<()> { + writeln!(output, "{}", header(&suite.name))?; + writeln!(output, " {}: {}", label("description"), suite.description)?; + + let queries = suite.discover_queries()?; + + if let (Some(first), Some(last)) = (queries.first(), queries.last()) { + writeln!( + output, + " {}: {}-{} discovered under {} as {}", + label("query ids"), + value(first.id), + value(last.id), + suite.query_search_root().display(), + literal("qNN.benchmark") + )?; + } + writeln!(output, " {}:", label("options"))?; + + for option in &suite.options { + write_suite_list_option(output, option)?; + } + + Ok(()) +} + +/// Writes one suite option summary for the `list` command. +fn write_suite_list_option(output: &mut String, option: &SuiteOption) -> Result<()> { + let values = option + .values + .iter() + .map(|v| { + if v == &option.default { + format!("{} ({})", value(v), label("default")) + } else { + value(v) + } + }) + .collect::>() + .join(", "); + + writeln!( + output, + " {} {} {}", + literal(option_display(option)), + placeholder(""), + values + )?; + + Ok(()) +} + +fn option_display(option: &SuiteOption) -> String { + match &option.short { + Some(short) => format!("-{short}, --{}", option.name), + None => format!("--{}", option.name), + } +} + +fn header(text: impl Display) -> String { + styled(AnsiColor::Green.on_default().bold(), text) +} + +fn literal(text: impl Display) -> String { + styled(AnsiColor::Cyan.on_default().bold(), text) +} + +fn placeholder(text: impl Display) -> String { + styled(AnsiColor::Cyan.on_default(), text) +} + +fn value(text: impl Display) -> String { + styled(AnsiColor::Green.on_default(), text) +} + +fn label(text: impl Display) -> String { + styled(Style::new().bold(), text) +} + +fn styled(style: Style, text: impl Display) -> String { + format!("{style}{text}{style:#}") +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + fn manifest_path(path: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(path) + } + + fn strip_ansi(input: &str) -> String { + let mut output = String::new(); + let mut chars = input.chars(); + while let Some(c) = chars.next() { + if c == '\x1b' { + for c in chars.by_ref() { + if c == 'm' { + break; + } + } + } else { + output.push(c); + } + } + output + } + + #[test] + fn list_output_mentions_tpch_options() { + let registry = SuiteRegistry::discover(manifest_path("sql_benchmarks")).unwrap(); + let output = strip_ansi(&format_suite_list_styled(®istry).unwrap()); + + assert!(output.contains("tpch\n description: TPC-H SQL benchmarks")); + assert!(output.contains("-f, --format parquet (default), csv, mem")); + assert!(output.contains("-sf, --scale-factor 1 (default), 10")); + } + + #[test] + fn styled_list_output_includes_ansi_sequences() { + let registry = SuiteRegistry::discover(manifest_path("sql_benchmarks")).unwrap(); + let output = format_suite_list_styled(®istry).unwrap(); + + assert!(output.contains("\u{1b}[")); + assert!(output.contains("tpch")); + assert!(output.contains("-f")); + assert!(output.contains("--format")); + assert!(output.contains("-sf")); + assert!(output.contains("--scale-factor")); + assert!(output.contains("")); + } +} diff --git a/benchmarks/src/benchmark_runner/suite.rs b/benchmarks/src/benchmark_runner/suite.rs new file mode 100644 index 0000000000000..fe5b3339b1643 --- /dev/null +++ b/benchmarks/src/benchmark_runner/suite.rs @@ -0,0 +1,500 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Suite-file loading and validation. +//! +//! A suite is described by a text `.suite` file that declares which options +//! the runner should display. Query discovery recursively scans from the suite +//! file's directory for `qNN.benchmark` files. Discovered queries are cached +//! lazily because they are reused by listing during a single CLI run. + +use datafusion_common::{DataFusionError, Result}; +use serde::{Deserialize, Serialize}; +use std::cell::OnceCell; +use std::collections::HashSet; +use std::fs::{self, DirEntry}; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SuiteQuery { + /// Numeric query id parsed from a `qNN.benchmark` file. + pub id: usize, + /// File name as it appears on disk, for example `q01.benchmark`. + pub file_name: String, + /// Full path to the benchmark file. + pub path: PathBuf, +} + +/// Parsed `.suite` file plus runtime metadata derived from its location. +/// +/// The serialized fields define the text configuration format. The skipped +/// fields are populated from the suite file path and used for discovery and +/// caching during a single runner invocation. +#[derive(Debug, Deserialize, Serialize)] +pub struct SuiteConfig { + /// Suite selector used on the command line, such as `tpch`. + pub name: String, + /// Human-readable suite description shown by `list`. + pub description: String, + /// Suite-specific options shown by `list`. + #[serde(default)] + pub options: Vec, + /// Path to the `.suite` file that produced this config. + #[serde(skip)] + pub suite_path: PathBuf, + /// Directory containing the `.suite` file. + #[serde(skip)] + pub suite_dir: PathBuf, + /// Lazily discovered benchmark query files for this suite. + #[serde(skip)] + pub(crate) query_cache: OnceCell>, +} + +impl Clone for SuiteConfig { + fn clone(&self) -> Self { + Self { + name: self.name.clone(), + description: self.description.clone(), + options: self.options.clone(), + suite_path: self.suite_path.clone(), + suite_dir: self.suite_dir.clone(), + query_cache: OnceCell::new(), + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SuiteOption { + /// Long option name, without the leading `--`. + pub name: String, + /// Optional short alias, without the leading `-`. + #[serde(default)] + pub short: Option, + /// Default option value. + pub default: String, + /// Allowed option values. + pub values: Vec, + /// Help text shown in command output. + pub help: String, +} + +/// Discovered suite metadata, sorted by suite name. +#[derive(Debug, Clone)] +pub struct SuiteRegistry { + suites: Vec, +} + +impl SuiteConfig { + /// Loads, parses, and validates one `.suite` file. + /// + /// The suite file path and containing directory are stored on the returned + /// config so later discovery can be resolved relative to the suite file. + pub fn from_file(path: impl AsRef) -> Result { + let suite_path = path.as_ref().to_path_buf(); + let contents = fs::read_to_string(&suite_path).map_err(|e| { + DataFusionError::External( + format!("failed to read suite file {}: {e}", suite_path.display()).into(), + ) + })?; + let mut suite: Self = toml::from_str(&contents).map_err(|e| { + DataFusionError::External( + format!("failed to parse suite file {}: {e}", suite_path.display()) + .into(), + ) + })?; + + suite.suite_dir = suite_path + .parent() + .map(Path::to_path_buf) + .unwrap_or_else(|| PathBuf::from(".")); + suite.suite_path = suite_path; + suite.validate()?; + + Ok(suite) + } + + /// Returns the directory recursively searched for query benchmark files. + pub fn query_search_root(&self) -> &Path { + &self.suite_dir + } + + /// Discovers and caches the suite's benchmark query files. + /// + /// Query files are found by recursively scanning from the suite directory, + /// accepting only `qNN.benchmark` files, sorting by numeric query id, and + /// rejecting duplicate ids. + pub fn discover_queries(&self) -> Result> { + if let Some(queries) = self.query_cache.get() { + return Ok(queries.clone()); + } + + let queries = self.scan_queries()?; + let _ = self.query_cache.set(queries.clone()); + Ok(queries) + } + + /// Performs uncached query discovery and duplicate-id validation. + fn scan_queries(&self) -> Result> { + let mut queries = Vec::new(); + + self.scan_query_dir(self.query_search_root(), &mut queries)?; + queries.sort_by(|left, right| { + left.id + .cmp(&right.id) + .then_with(|| left.path.cmp(&right.path)) + }); + + for pair in queries.windows(2) { + let [left, right] = pair else { + continue; + }; + if left.id == right.id { + return Err(DataFusionError::Configuration(format!( + "duplicate QUERY_ID {} in suite '{}': {} and {}", + left.id, + self.name, + left.path.display(), + right.path.display() + ))); + } + } + + Ok(queries) + } + + /// Recursively scans a directory and appends valid query benchmark files to + /// the provided collection. + fn scan_query_dir(&self, dir: &Path, queries: &mut Vec) -> Result<()> { + let mut entries = read_dir_entries(dir, "benchmark query directory")?; + entries.sort_by_key(|entry| entry.file_name()); + + for entry in entries { + let path = entry.path(); + let file_type = entry.file_type().map_err(|e| { + DataFusionError::External( + format!( + "failed to read benchmark query entry type {}: {e}", + path.display() + ) + .into(), + ) + })?; + + if file_type.is_dir() { + self.scan_query_dir(&path, queries)?; + continue; + } + + if path + .extension() + .is_none_or(|extension| extension != "benchmark") + { + continue; + } + + let file_name = entry.file_name().to_string_lossy().into_owned(); + if let Some(id) = parse_query_file_name(&file_name) { + queries.push(SuiteQuery { + id, + file_name, + path, + }); + } + } + + Ok(()) + } + + /// Validates suite metadata that cannot be enforced by TOML deserialization. + fn validate(&self) -> Result<()> { + self.validate_suite_fields()?; + self.validate_options() + } + + /// Validates required suite-level fields. + fn validate_suite_fields(&self) -> Result<()> { + if self.name.trim().is_empty() { + return Err(DataFusionError::Configuration( + "suite name cannot be empty".to_string(), + )); + } + + Ok(()) + } + + /// Validates suite-defined option declarations. + fn validate_options(&self) -> Result<()> { + let mut option_names = HashSet::new(); + for option in &self.options { + if !option_names.insert(option.name.as_str()) { + return Err(DataFusionError::Configuration(format!( + "duplicate option name '{}'", + option.name + ))); + } + + option.validate()?; + } + + Ok(()) + } +} + +impl SuiteOption { + /// Validates one suite-defined option. + fn validate(&self) -> Result<()> { + if self.name.trim().is_empty() { + return Err(DataFusionError::Configuration( + "option name cannot be empty".to_string(), + )); + } + + if !is_valid_cli_option_name(&self.name) { + return Err(DataFusionError::Configuration(format!( + "invalid option name '{}'; expected lowercase ASCII letters, digits, and hyphens", + self.name + ))); + } + + self.validate_short_alias()?; + + if self.help.trim().is_empty() { + return Err(DataFusionError::Configuration(format!( + "help for option '{}' cannot be empty", + self.name + ))); + } + + if self.values.is_empty() { + return Err(DataFusionError::Configuration(format!( + "values for option '{}' cannot be empty", + self.name + ))); + } + + let mut values = HashSet::new(); + for value in &self.values { + if !values.insert(value.as_str()) { + return Err(DataFusionError::Configuration(format!( + "duplicate value '{}' for option '{}'", + value, self.name + ))); + } + } + + if !self.values.contains(&self.default) { + return Err(DataFusionError::Configuration(format!( + "default value '{}' for option '{}' must be present in values", + self.default, self.name + ))); + } + + Ok(()) + } + + /// Validates the optional short alias for one suite-defined option. + fn validate_short_alias(&self) -> Result<()> { + let Some(short) = &self.short else { + return Ok(()); + }; + + if short.trim().is_empty() { + return Err(DataFusionError::Configuration(format!( + "short alias for option '{}' cannot be empty", + self.name + ))); + } + + if !is_valid_cli_option_name(short) { + return Err(DataFusionError::Configuration(format!( + "invalid short alias '{short}' for option '{}'; expected lowercase ASCII letters, digits, and hyphens", + self.name + ))); + } + + Ok(()) + } +} + +impl SuiteRegistry { + /// Discovers all suite files below the SQL benchmark root. + /// + /// Each direct child directory is searched for `.suite` files. Suite names + /// must be unique across the registry so command selectors are + /// unambiguous. + pub fn discover(root: impl AsRef) -> Result { + let root = root.as_ref(); + let mut suites = Vec::new(); + let mut suite_names = HashSet::new(); + + for entry in read_dir_entries(root, "benchmark suite root")? { + if !entry + .file_type() + .map_err(|e| { + DataFusionError::External( + format!( + "failed to read benchmark suite entry type {}: {e}", + entry.path().display() + ) + .into(), + ) + })? + .is_dir() + { + continue; + } + + let mut suite_files = Vec::new(); + let suite_dir = entry.path(); + for suite_entry in read_dir_entries(&suite_dir, "benchmark suite directory")? + { + let path = suite_entry.path(); + if path + .extension() + .is_some_and(|extension| extension == "suite") + { + suite_files.push(path); + } + } + suite_files.sort(); + + for suite_file in suite_files { + let suite = SuiteConfig::from_file(suite_file)?; + if !suite_names.insert(suite.name.clone()) { + return Err(DataFusionError::Configuration(format!( + "duplicate suite name '{}'", + suite.name + ))); + } + suites.push(suite); + } + } + + suites.sort_by(|left, right| left.name.cmp(&right.name)); + + Ok(Self { suites }) + } + + /// Returns discovered suites sorted by selector name. + pub fn suites(&self) -> &[SuiteConfig] { + &self.suites + } +} + +fn parse_query_file_name(file_name: &str) -> Option { + let query_id = file_name.strip_prefix('q')?.strip_suffix(".benchmark")?; + + if query_id.len() < 2 || !query_id.chars().all(|c| c.is_ascii_digit()) { + return None; + } + + query_id.parse().ok() +} + +fn is_valid_cli_option_name(name: &str) -> bool { + name.chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-') +} + +fn read_dir_entries(dir: &Path, label: &str) -> Result> { + let entries = fs::read_dir(dir).map_err(|e| { + DataFusionError::External( + format!("failed to read {label} {}: {e}", dir.display()).into(), + ) + })?; + + entries + .collect::>>() + .map_err(|e| DataFusionError::External(Box::new(e))) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + fn manifest_path(path: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(path) + } + + #[test] + fn discovers_tpch_suite_file() { + let registry = SuiteRegistry::discover(manifest_path("sql_benchmarks")).unwrap(); + + assert_eq!(registry.suites().len(), 1); + assert_eq!(registry.suites()[0].name, "tpch"); + assert_eq!(registry.suites()[0].options.len(), 2); + } + + #[test] + fn discovers_query_ids_in_numeric_order() { + let registry = SuiteRegistry::discover(manifest_path("sql_benchmarks")).unwrap(); + let suite = ®istry.suites()[0]; + let queries = suite.discover_queries().unwrap(); + let ids = queries.iter().map(|query| query.id).collect::>(); + + assert_eq!(ids.first(), Some(&1)); + assert_eq!(ids.last(), Some(&22)); + assert!(!ids.contains(&0)); + } + + #[test] + fn rejects_duplicate_suite_names() { + let dir = tempfile::tempdir().unwrap(); + let one = dir.path().join("one"); + let two = dir.path().join("two"); + fs::create_dir_all(&one).unwrap(); + fs::create_dir_all(&two).unwrap(); + fs::write( + one.join("suite.suite"), + "name = \"dup\"\ndescription = \"one\"\n", + ) + .unwrap(); + fs::write( + two.join("suite.suite"), + "name = \"dup\"\ndescription = \"two\"\n", + ) + .unwrap(); + + let err = SuiteRegistry::discover(dir.path()).unwrap_err(); + assert!(err.to_string().contains("duplicate suite name")); + } + + #[test] + fn rejects_invalid_option_metadata() { + let dir = tempfile::tempdir().unwrap(); + let suite_dir = dir.path().join("suite"); + fs::create_dir_all(&suite_dir).unwrap(); + fs::write( + suite_dir.join("bad.suite"), + r#" +name = "bad" +description = "bad suite" + +[[options]] +name = "BAD" +default = "one" +values = ["one"] +help = "bad" +"#, + ) + .unwrap(); + + let err = SuiteRegistry::discover(dir.path()).unwrap_err(); + assert!(err.to_string().contains("invalid option name")); + } +} diff --git a/benchmarks/src/bin/benchmark_runner.rs b/benchmarks/src/bin/benchmark_runner.rs new file mode 100644 index 0000000000000..831d6f96d7e22 --- /dev/null +++ b/benchmarks/src/bin/benchmark_runner.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_benchmarks::benchmark_runner::run_cli; + +#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] +compile_error!( + "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" +); + +#[cfg(feature = "snmalloc")] +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + +#[cfg(feature = "mimalloc")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn main() { + env_logger::init(); + if let Err(e) = run_cli(std::env::args()) { + eprintln!("{e}"); + std::process::exit(1); + } +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index 7c598e65d824c..983c5fc24a6eb 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -16,6 +16,7 @@ // under the License. //! DataFusion benchmark runner +pub mod benchmark_runner; pub mod cancellation; pub mod clickbench; pub mod h2o;