Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties};
use crate::error::_config_err;
use crate::format::{ExplainAnalyzeLevel, ExplainFormat};
use crate::parquet_config::DFParquetWriterVersion;
use crate::parsers::CompressionTypeVariant;
use crate::parsers::{CompressionTypeVariant, CsvQuoteStyle};
use crate::utils::get_available_parallelism;
use crate::{DataFusionError, Result};
#[cfg(feature = "parquet_encryption")]
Expand Down Expand Up @@ -1855,6 +1855,17 @@ impl ConfigField for CompressionTypeVariant {
}
}

impl ConfigField for CsvQuoteStyle {
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
v.some(key, self, description)
}

fn set(&mut self, _: &str, value: &str) -> Result<()> {
*self = CsvQuoteStyle::from_str(value)?;
Ok(())
}
}

/// An implementation trait used to recursively walk configuration
pub trait Visit {
fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);
Expand Down Expand Up @@ -2927,6 +2938,13 @@ config_namespace! {
pub terminator: Option<u8>, default = None
pub escape: Option<u8>, default = None
pub double_quote: Option<bool>, default = None
/// Quote style for CSV writing.
/// One of: "Always", "Necessary", "NonNumeric", "Never"
pub quote_style: CsvQuoteStyle, default = CsvQuoteStyle::Necessary
/// Whether to ignore leading whitespace in string values when writing CSV.
pub ignore_leading_whitespace: Option<bool>, default = None
/// Whether to ignore trailing whitespace in string values when writing CSV.
pub ignore_trailing_whitespace: Option<bool>, default = None
/// Specifies whether newlines in (quoted) values are supported.
Comment on lines +2945 to 2948
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering why we have this trend of doing Option<bool> instead of just bool 🤔

What is the default in case of None, and how is that different from just true/false (whichever it is)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was also curious about this but didn't want to break the pattern...

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it some ser-deser quirk? I'll see if I can figure this out.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No I can't find a good reason for this... Would you like me to change or keep the pattern?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps its to do with hierarchy of configs and/or sql parsing 🤔

@alamb do you happen to know a reason for having Option<bool> instead of plain bool for cases where it'll end up being true or false anyway? (i.e. None doesn't represent a third state, but eventually maps to either true or false)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fired a question off into the Discord to see if anyone has some more insight; ideally I'd like to get this config right the first time to avoid needing a breaking change later (or forever be stuck with this awkward interface if it turns out to be unnecessary)

///
/// Parsing newlines in quoted values may be affected by execution behaviour such as
Expand Down Expand Up @@ -3035,6 +3053,30 @@ impl CsvOptions {
self
}

/// Set the quote style for CSV writing.
pub fn with_quote_style(mut self, quote_style: CsvQuoteStyle) -> Self {
self.quote_style = quote_style;
self
}

/// Set whether to ignore leading whitespace in string values when writing CSV.
pub fn with_ignore_leading_whitespace(
mut self,
ignore_leading_whitespace: bool,
) -> Self {
self.ignore_leading_whitespace = Some(ignore_leading_whitespace);
self
}

/// Set whether to ignore trailing whitespace in string values when writing CSV.
pub fn with_ignore_trailing_whitespace(
mut self,
ignore_trailing_whitespace: bool,
) -> Self {
self.ignore_trailing_whitespace = Some(ignore_trailing_whitespace);
self
}

/// Specifies whether newlines in (quoted) values are supported.
///
/// Parsing newlines in quoted values may be affected by execution behaviour such as
Expand Down
7 changes: 7 additions & 0 deletions datafusion/common/src/file_options/csv_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
if let Some(v) = &value.double_quote {
builder = builder.with_double_quote(*v)
}
builder = builder.with_quote_style(value.quote_style.into());
if let Some(v) = &value.ignore_leading_whitespace {
builder = builder.with_ignore_leading_whitespace(*v)
}
if let Some(v) = &value.ignore_trailing_whitespace {
builder = builder.with_ignore_trailing_whitespace(*v)
}
Ok(CsvWriterOptions {
writer_options: builder,
compression: value.compression,
Expand Down
56 changes: 56 additions & 0 deletions datafusion/common/src/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,59 @@ impl CompressionTypeVariant {
!matches!(self, &Self::UNCOMPRESSED)
}
}

/// CSV quote style
///
/// Controls when fields are quoted when writing CSV files.
/// Corresponds to [`arrow::csv::QuoteStyle`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum CsvQuoteStyle {
/// Quote all fields
Always,
/// Only quote fields when necessary (default)
#[default]
Necessary,
/// Quote all non-numeric fields
NonNumeric,
/// Never quote fields
Never,
}
Comment thread
Jefffrey marked this conversation as resolved.

impl FromStr for CsvQuoteStyle {
type Err = DataFusionError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"always" => Ok(Self::Always),
"necessary" => Ok(Self::Necessary),
"non_numeric" | "nonnumeric" => Ok(Self::NonNumeric),
"never" => Ok(Self::Never),
_ => Err(DataFusionError::NotImplemented(format!(
"Unsupported CSV quote style {s}"
))),
}
}
}

impl From<CsvQuoteStyle> for arrow::csv::QuoteStyle {
fn from(style: CsvQuoteStyle) -> Self {
match style {
CsvQuoteStyle::Always => Self::Always,
CsvQuoteStyle::NonNumeric => Self::NonNumeric,
CsvQuoteStyle::Never => Self::Never,
CsvQuoteStyle::Necessary => Self::Necessary,
}
}
}

impl Display for CsvQuoteStyle {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let str = match self {
Self::Always => "Always",
Self::Necessary => "Necessary",
Self::NonNumeric => "NonNumeric",
Self::Never => "Never",
};
write!(f, "{str}")
}
}
19 changes: 19 additions & 0 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,13 @@ message JsonWriterOptions {
}


enum CsvQuoteStyle {
NECESSARY = 0;
ALWAYS = 1;
NON_NUMERIC = 2;
NEVER = 3;
}

message CsvWriterOptions {
// Compression type
CompressionTypeVariant compression = 1;
Expand All @@ -453,6 +460,12 @@ message CsvWriterOptions {
string escape = 10;
// Optional flag whether to double quotes, instead of escaping. Defaults to `true`
bool double_quote = 11;
// Quote style for CSV writing
CsvQuoteStyle quote_style = 12;
// Whether to ignore leading whitespace in string values
bool ignore_leading_whitespace = 13;
// Whether to ignore trailing whitespace in string values
bool ignore_trailing_whitespace = 14;
}

// Options controlling CSV format
Expand All @@ -476,6 +489,12 @@ message CsvOptions {
bytes terminator = 17; // Optional terminator character as a byte
bytes truncated_rows = 18; // Indicates if truncated rows are allowed
optional uint32 compression_level = 19; // Optional compression level
// Quote style for CSV writing
CsvQuoteStyle quote_style = 20;
// Whether to ignore leading whitespace in string values
bytes ignore_leading_whitespace = 21;
// Whether to ignore trailing whitespace in string values
bytes ignore_trailing_whitespace = 22;
}

// Options controlling CSV format
Expand Down
37 changes: 35 additions & 2 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::common::proto_error;
use crate::protobuf_common as protobuf;
use arrow::array::{ArrayRef, AsArray};
use arrow::buffer::Buffer;
use arrow::csv::WriterBuilder;
use arrow::csv::{QuoteStyle, WriterBuilder};
use arrow::datatypes::{
DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema,
TimeUnit, UnionFields, UnionMode, i256,
Expand Down Expand Up @@ -947,6 +947,17 @@ impl From<CompressionTypeVariant> for protobuf::CompressionTypeVariant {
}
}

impl From<protobuf::CsvQuoteStyle> for datafusion_common::parsers::CsvQuoteStyle {
fn from(value: protobuf::CsvQuoteStyle) -> Self {
match value {
protobuf::CsvQuoteStyle::Necessary => Self::Necessary,
protobuf::CsvQuoteStyle::Always => Self::Always,
protobuf::CsvQuoteStyle::NonNumeric => Self::NonNumeric,
protobuf::CsvQuoteStyle::Never => Self::Never,
}
}
}

impl TryFrom<&protobuf::CsvWriterOptions> for CsvWriterOptions {
type Error = DataFusionError;

Expand Down Expand Up @@ -1003,6 +1014,15 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
.then(|| proto_opts.null_regex.clone()),
comment: proto_opts.comment.first().copied(),
truncated_rows: proto_opts.truncated_rows.first().map(|h| *h != 0),
quote_style: proto_opts.quote_style().into(),
ignore_leading_whitespace: proto_opts
.ignore_leading_whitespace
.first()
.map(|h| *h != 0),
ignore_trailing_whitespace: proto_opts
.ignore_trailing_whitespace
.first()
.map(|h| *h != 0),
})
}
}
Expand Down Expand Up @@ -1253,12 +1273,25 @@ pub(crate) fn csv_writer_options_from_proto(
return Err(proto_error("Error parsing CSV Escape"));
}
}
let quote_style = match protobuf::CsvQuoteStyle::try_from(writer_options.quote_style)
{
Ok(protobuf::CsvQuoteStyle::Always) => QuoteStyle::Always,
Ok(protobuf::CsvQuoteStyle::NonNumeric) => QuoteStyle::NonNumeric,
Ok(protobuf::CsvQuoteStyle::Never) => QuoteStyle::Never,
Ok(protobuf::CsvQuoteStyle::Necessary) => QuoteStyle::Necessary,
_ => Err(proto_error(
"Unknown quote style, must be one of: 'Always', 'NonNumeric', 'Never', 'Necessary'",
))?,
};
Ok(builder
.with_header(writer_options.has_header)
.with_date_format(writer_options.date_format.clone())
.with_datetime_format(writer_options.datetime_format.clone())
.with_timestamp_format(writer_options.timestamp_format.clone())
.with_time_format(writer_options.time_format.clone())
.with_null(writer_options.null_value.clone())
.with_double_quote(writer_options.double_quote))
.with_double_quote(writer_options.double_quote)
.with_quote_style(quote_style)
.with_ignore_leading_whitespace(writer_options.ignore_leading_whitespace)
.with_ignore_trailing_whitespace(writer_options.ignore_trailing_whitespace))
}
Loading
Loading