Skip to content

Commit 85e75e2

Browse files
authored
Add quote style and trimming to csv writer (#20813)
## Which issue does this PR close? - Closes #10669 Related arrow-rs PRs apache/arrow-rs#8960 and apache/arrow-rs#9004 ## Rationale for this change The CSV writer was missing support for `quote_style`, `ignore_leading_whitespace`, and `ignore_trailing_whitespace` options that are available on the underlying arrow `WriterBuilder`. This meant users couldn't control quoting behaviour or whitespace trimming when writing CSV files. ## What changes are included in this PR? Adds three new CSV writer options wired through the full stack: - `quote_style` — controls when fields are quoted (`Always`, `Necessary`, `NonNumeric`, `Never`). Modelled as a protobuf enum (`CsvQuoteStyle`). - `ignore_leading_whitespace` — trims leading whitespace from string values on write. - `ignore_trailing_whitespace` — trims trailing whitespace from string values on write. ## Are these changes tested? Yes — sqllogictest coverage added in `csv_files.slt` ## Are there any user-facing changes? Three new `format.*` options available in COPY TO and CREATE EXTERNAL TABLE for CSV: - `format.quote_style` (string: `Always`, `Necessary`, `NonNumeric`, `Never`) - `format.ignore_leading_whitespace` (boolean) - `format.ignore_trailing_whitespace` (boolean)
1 parent 1ea328d commit 85e75e2

11 files changed

Lines changed: 717 additions & 8 deletions

File tree

datafusion/common/src/config.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use crate::encryption::{FileDecryptionProperties, FileEncryptionProperties};
2424
use crate::error::_config_err;
2525
use crate::format::{ExplainAnalyzeCategories, ExplainFormat, MetricType};
2626
use crate::parquet_config::DFParquetWriterVersion;
27-
use crate::parsers::CompressionTypeVariant;
27+
use crate::parsers::{CompressionTypeVariant, CsvQuoteStyle};
2828
use crate::utils::get_available_parallelism;
2929
use crate::{DataFusionError, Result};
3030
#[cfg(feature = "parquet_encryption")]
@@ -2042,6 +2042,17 @@ impl ConfigField for CompressionTypeVariant {
20422042
}
20432043
}
20442044

2045+
impl ConfigField for CsvQuoteStyle {
2046+
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
2047+
v.some(key, self, description)
2048+
}
2049+
2050+
fn set(&mut self, _: &str, value: &str) -> Result<()> {
2051+
*self = CsvQuoteStyle::from_str(value)?;
2052+
Ok(())
2053+
}
2054+
}
2055+
20452056
/// An implementation trait used to recursively walk configuration
20462057
pub trait Visit {
20472058
fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str);
@@ -3114,6 +3125,15 @@ config_namespace! {
31143125
pub terminator: Option<u8>, default = None
31153126
pub escape: Option<u8>, default = None
31163127
pub double_quote: Option<bool>, default = None
3128+
/// Quote style for CSV writing.
3129+
/// One of: "Always", "Necessary", "NonNumeric", "Never"
3130+
pub quote_style: CsvQuoteStyle, default = CsvQuoteStyle::Necessary
3131+
/// Whether to ignore leading whitespace in string values when writing CSV.
3132+
/// Defaults to `false` when `None`.
3133+
pub ignore_leading_whitespace: Option<bool>, default = None
3134+
/// Whether to ignore trailing whitespace in string values when writing CSV.
3135+
/// Defaults to `false` when `None`.
3136+
pub ignore_trailing_whitespace: Option<bool>, default = None
31173137
/// Specifies whether newlines in (quoted) values are supported.
31183138
///
31193139
/// Parsing newlines in quoted values may be affected by execution behaviour such as
@@ -3222,6 +3242,30 @@ impl CsvOptions {
32223242
self
32233243
}
32243244

3245+
/// Set the quote style for CSV writing.
3246+
pub fn with_quote_style(mut self, quote_style: CsvQuoteStyle) -> Self {
3247+
self.quote_style = quote_style;
3248+
self
3249+
}
3250+
3251+
/// Set whether to ignore leading whitespace in string values when writing CSV.
3252+
pub fn with_ignore_leading_whitespace(
3253+
mut self,
3254+
ignore_leading_whitespace: bool,
3255+
) -> Self {
3256+
self.ignore_leading_whitespace = Some(ignore_leading_whitespace);
3257+
self
3258+
}
3259+
3260+
/// Set whether to ignore trailing whitespace in string values when writing CSV.
3261+
pub fn with_ignore_trailing_whitespace(
3262+
mut self,
3263+
ignore_trailing_whitespace: bool,
3264+
) -> Self {
3265+
self.ignore_trailing_whitespace = Some(ignore_trailing_whitespace);
3266+
self
3267+
}
3268+
32253269
/// Specifies whether newlines in (quoted) values are supported.
32263270
///
32273271
/// Parsing newlines in quoted values may be affected by execution behaviour such as

datafusion/common/src/file_options/csv_writer.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,13 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
9494
if let Some(v) = &value.double_quote {
9595
builder = builder.with_double_quote(*v)
9696
}
97+
builder = builder.with_quote_style(value.quote_style.into());
98+
if let Some(v) = &value.ignore_leading_whitespace {
99+
builder = builder.with_ignore_leading_whitespace(*v)
100+
}
101+
if let Some(v) = &value.ignore_trailing_whitespace {
102+
builder = builder.with_ignore_trailing_whitespace(*v)
103+
}
97104
Ok(CsvWriterOptions {
98105
writer_options: builder,
99106
compression: value.compression,

datafusion/common/src/parsers.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,59 @@ impl CompressionTypeVariant {
7373
!matches!(self, &Self::UNCOMPRESSED)
7474
}
7575
}
76+
77+
/// CSV quote style
78+
///
79+
/// Controls when fields are quoted when writing CSV files.
80+
/// Corresponds to [`arrow::csv::QuoteStyle`].
81+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
82+
pub enum CsvQuoteStyle {
83+
/// Quote all fields
84+
Always,
85+
/// Only quote fields when necessary (default)
86+
#[default]
87+
Necessary,
88+
/// Quote all non-numeric fields
89+
NonNumeric,
90+
/// Never quote fields
91+
Never,
92+
}
93+
94+
impl FromStr for CsvQuoteStyle {
95+
type Err = DataFusionError;
96+
97+
fn from_str(s: &str) -> Result<Self, Self::Err> {
98+
match s.to_lowercase().as_str() {
99+
"always" => Ok(Self::Always),
100+
"necessary" => Ok(Self::Necessary),
101+
"non_numeric" | "nonnumeric" => Ok(Self::NonNumeric),
102+
"never" => Ok(Self::Never),
103+
_ => Err(DataFusionError::NotImplemented(format!(
104+
"Unsupported CSV quote style {s}"
105+
))),
106+
}
107+
}
108+
}
109+
110+
impl From<CsvQuoteStyle> for arrow::csv::QuoteStyle {
111+
fn from(style: CsvQuoteStyle) -> Self {
112+
match style {
113+
CsvQuoteStyle::Always => Self::Always,
114+
CsvQuoteStyle::NonNumeric => Self::NonNumeric,
115+
CsvQuoteStyle::Never => Self::Never,
116+
CsvQuoteStyle::Necessary => Self::Necessary,
117+
}
118+
}
119+
}
120+
121+
impl Display for CsvQuoteStyle {
122+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
123+
let str = match self {
124+
Self::Always => "Always",
125+
Self::Necessary => "Necessary",
126+
Self::NonNumeric => "NonNumeric",
127+
Self::Never => "Never",
128+
};
129+
write!(f, "{str}")
130+
}
131+
}

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,13 @@ message JsonWriterOptions {
434434
}
435435

436436

437+
enum CsvQuoteStyle {
438+
NECESSARY = 0;
439+
ALWAYS = 1;
440+
NON_NUMERIC = 2;
441+
NEVER = 3;
442+
}
443+
437444
message CsvWriterOptions {
438445
// Compression type
439446
CompressionTypeVariant compression = 1;
@@ -457,6 +464,12 @@ message CsvWriterOptions {
457464
string escape = 10;
458465
// Optional flag whether to double quotes, instead of escaping. Defaults to `true`
459466
bool double_quote = 11;
467+
// Quote style for CSV writing
468+
CsvQuoteStyle quote_style = 12;
469+
// Whether to ignore leading whitespace in string values
470+
bool ignore_leading_whitespace = 13;
471+
// Whether to ignore trailing whitespace in string values
472+
bool ignore_trailing_whitespace = 14;
460473
}
461474

462475
// Options controlling CSV format
@@ -480,6 +493,12 @@ message CsvOptions {
480493
bytes terminator = 17; // Optional terminator character as a byte
481494
bytes truncated_rows = 18; // Indicates if truncated rows are allowed
482495
optional uint32 compression_level = 19; // Optional compression level
496+
// Quote style for CSV writing
497+
CsvQuoteStyle quote_style = 20;
498+
// Whether to ignore leading whitespace in string values
499+
bytes ignore_leading_whitespace = 21;
500+
// Whether to ignore trailing whitespace in string values
501+
bytes ignore_trailing_whitespace = 22;
483502
}
484503

485504
// Options controlling CSV format

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use crate::common::proto_error;
2222
use crate::protobuf_common as protobuf;
2323
use arrow::array::{ArrayRef, AsArray};
2424
use arrow::buffer::Buffer;
25-
use arrow::csv::WriterBuilder;
25+
use arrow::csv::{QuoteStyle, WriterBuilder};
2626
use arrow::datatypes::{
2727
DataType, Field, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, Schema,
2828
TimeUnit, UnionFields, UnionMode, i256,
@@ -964,6 +964,17 @@ impl From<CompressionTypeVariant> for protobuf::CompressionTypeVariant {
964964
}
965965
}
966966

967+
impl From<protobuf::CsvQuoteStyle> for datafusion_common::parsers::CsvQuoteStyle {
968+
fn from(value: protobuf::CsvQuoteStyle) -> Self {
969+
match value {
970+
protobuf::CsvQuoteStyle::Necessary => Self::Necessary,
971+
protobuf::CsvQuoteStyle::Always => Self::Always,
972+
protobuf::CsvQuoteStyle::NonNumeric => Self::NonNumeric,
973+
protobuf::CsvQuoteStyle::Never => Self::Never,
974+
}
975+
}
976+
}
977+
967978
impl TryFrom<&protobuf::CsvWriterOptions> for CsvWriterOptions {
968979
type Error = DataFusionError;
969980

@@ -1020,6 +1031,15 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {
10201031
.then(|| proto_opts.null_regex.clone()),
10211032
comment: proto_opts.comment.first().copied(),
10221033
truncated_rows: proto_opts.truncated_rows.first().map(|h| *h != 0),
1034+
quote_style: proto_opts.quote_style().into(),
1035+
ignore_leading_whitespace: proto_opts
1036+
.ignore_leading_whitespace
1037+
.first()
1038+
.map(|h| *h != 0),
1039+
ignore_trailing_whitespace: proto_opts
1040+
.ignore_trailing_whitespace
1041+
.first()
1042+
.map(|h| *h != 0),
10231043
})
10241044
}
10251045
}
@@ -1281,14 +1301,27 @@ pub(crate) fn csv_writer_options_from_proto(
12811301
return Err(proto_error("Error parsing CSV Escape"));
12821302
}
12831303
}
1304+
let quote_style = match protobuf::CsvQuoteStyle::try_from(writer_options.quote_style)
1305+
{
1306+
Ok(protobuf::CsvQuoteStyle::Always) => QuoteStyle::Always,
1307+
Ok(protobuf::CsvQuoteStyle::NonNumeric) => QuoteStyle::NonNumeric,
1308+
Ok(protobuf::CsvQuoteStyle::Never) => QuoteStyle::Never,
1309+
Ok(protobuf::CsvQuoteStyle::Necessary) => QuoteStyle::Necessary,
1310+
_ => Err(proto_error(
1311+
"Unknown quote style, must be one of: 'Always', 'NonNumeric', 'Never', 'Necessary'",
1312+
))?,
1313+
};
12841314
Ok(builder
12851315
.with_header(writer_options.has_header)
12861316
.with_date_format(writer_options.date_format.clone())
12871317
.with_datetime_format(writer_options.datetime_format.clone())
12881318
.with_timestamp_format(writer_options.timestamp_format.clone())
12891319
.with_time_format(writer_options.time_format.clone())
12901320
.with_null(writer_options.null_value.clone())
1291-
.with_double_quote(writer_options.double_quote))
1321+
.with_double_quote(writer_options.double_quote)
1322+
.with_quote_style(quote_style)
1323+
.with_ignore_leading_whitespace(writer_options.ignore_leading_whitespace)
1324+
.with_ignore_trailing_whitespace(writer_options.ignore_trailing_whitespace))
12921325
}
12931326

12941327
#[cfg(test)]

0 commit comments

Comments
 (0)