Skip to content

Commit 47936bb

Browse files
committed
feat: add support for parquet content defined chunking options
1 parent abf8f61 commit 47936bb

File tree

15 files changed

+488
-99
lines changed

15 files changed

+488
-99
lines changed

Cargo.lock

Lines changed: 44 additions & 59 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,3 +282,21 @@ incremental = false
282282
inherits = "release"
283283
debug = true
284284
strip = false
285+
286+
[patch.crates-io]
287+
arrow = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
288+
arrow-arith = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
289+
arrow-array = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
290+
arrow-buffer = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
291+
arrow-cast = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
292+
arrow-csv = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
293+
arrow-data = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
294+
arrow-flight = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
295+
arrow-ipc = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
296+
arrow-json = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
297+
arrow-ord = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
298+
arrow-row = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
299+
arrow-schema = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
300+
arrow-select = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
301+
arrow-string = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }
302+
parquet = { git = "https://github.com/apache/arrow-rs", tag = "58.1.0-rc1" }

datafusion/common/src/config.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,7 @@ config_namespace! {
845845
/// default parquet writer setting
846846
pub bloom_filter_ndv: Option<u64>, default = None
847847

848+
848849
/// (writing) Controls whether DataFusion will attempt to speed up writing
849850
/// parquet files by serializing them in parallel. Each column
850851
/// in each row group in each output file is serialized in parallel
@@ -872,6 +873,27 @@ config_namespace! {
872873
/// writing out already in-memory data, such as from a cached
873874
/// data frame.
874875
pub maximum_buffered_record_batches_per_stream: usize, default = 2
876+
877+
/// (writing) EXPERIMENTAL: Enable content-defined chunking (CDC) when writing
878+
/// parquet files. When true, the other `cdc_*` options control the chunking
879+
/// behavior. When CDC is enabled, parallel writing is automatically disabled
880+
/// since the chunker state must persist across row groups.
881+
pub enable_content_defined_chunking: bool, default = false
882+
883+
/// (writing) Minimum chunk size in bytes for content-defined chunking.
884+
/// The rolling hash will not be updated until this size is reached for each chunk.
885+
/// Default is 256 KiB. Only used when `enable_content_defined_chunking` is true.
886+
pub cdc_min_chunk_size: usize, default = 256 * 1024
887+
888+
/// (writing) Maximum chunk size in bytes for content-defined chunking.
889+
/// The chunker will create a new chunk whenever the chunk size exceeds this value.
890+
/// Default is 1 MiB. Only used when `enable_content_defined_chunking` is true.
891+
pub cdc_max_chunk_size: usize, default = 1024 * 1024
892+
893+
/// (writing) Normalization level for content-defined chunking.
894+
/// Increasing this improves deduplication ratio but increases fragmentation.
895+
/// Recommended range is [-3, 3], default is 0. Only used when `enable_content_defined_chunking` is true.
896+
pub cdc_norm_level: i64, default = 0
875897
}
876898
}
877899

@@ -1820,6 +1842,7 @@ config_field!(usize);
18201842
config_field!(f64);
18211843
config_field!(u64);
18221844
config_field!(u32);
1845+
config_field!(i64);
18231846

18241847
impl ConfigField for u8 {
18251848
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {

0 commit comments

Comments
 (0)