Skip to content

Commit 627faba

Browse files
getChanjecsand838alamb
authored
Migrate Avro reader to arrow-avro and remove internal conversion code (#17861)
## Which issue does this PR close? - Closes #14097 ## Rationale for this change DataFusion previously maintained custom Avro-to-Arrow conversion logic. This PR migrates Avro reading to `arrow-avro` to align behavior with upstream Arrow and remove duplicated implementation. ## What changes are included in this PR? - Switched DataFusion Avro reader path to `arrow-avro` (`ReaderBuilder`) - Removed internal/legacy Avro conversion paths that are no longer needed - Updated crate wiring to use `arrow-avro` and removed prior `apache-avro` dependency usage in affected paths - Updated Avro projection flow to use `arrow-avro` projection support - Added/updated upgrade documentation for Avro API and behavior changes ## Are these changes tested? Yes. - Added/updated Avro reader unit tests in `datafusion/datasource-avro` (including projection and timestamp logical types) - Updated SQL logic tests in `datafusion/sqllogictest/test_files/avro.slt` - Integration is covered by existing CI/test suites for affected crates ## Are there any user-facing changes? Yes. 1. `DataFusionError::AvroError` is removed. 2. `From<apache_avro::Error> for DataFusionError` is removed. 3. Re-export changed from `datafusion::apache_avro` to `datafusion::arrow_avro`. 4. Avro feature wiring changed: - `datafusion` crate `avro` feature no longer enables `datafusion-common/avro` - `datafusion-proto` crate `avro` feature no longer enables `datafusion-common/avro` 5. Avro decoding behavior now follows `arrow-avro` semantics, including: - Avro `string` values being read as Arrow `Binary` in this path - `timestamp-*` logical types read as UTC timezone-aware timestamps (`Timestamp(..., Some("+00:00"))`) - `local-timestamp-*` remaining timezone-naive (`Timestamp(..., None)`) Upgrade notes are documented in: `docs/source/library-user-guide/upgrading/53.0.0.md` --------- Co-authored-by: Connor Sanders <170039284+jecsand838@users.noreply.github.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 37978e3 commit 627faba

19 files changed

Lines changed: 602 additions & 2895 deletions

File tree

Cargo.lock

Lines changed: 43 additions & 98 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ arrow = { version = "58.1.0", features = [
9292
"prettyprint",
9393
"chrono-tz",
9494
] }
95+
arrow-avro = { version = "58.1.0", default-features = false, features = [
96+
"deflate",
97+
"snappy",
98+
"zstd",
99+
"bzip2",
100+
"xz",
101+
] }
95102
arrow-buffer = { version = "58.1.0", default-features = false }
96103
arrow-flight = { version = "58.1.0", features = [
97104
"flight-sql-experimental",

datafusion/common/Cargo.toml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ workspace = true
4141
name = "datafusion_common"
4242

4343
[features]
44-
avro = ["apache-avro"]
4544
backtrace = []
4645
parquet_encryption = [
4746
"parquet",
@@ -66,12 +65,6 @@ harness = false
6665
name = "stats_merge"
6766

6867
[dependencies]
69-
apache-avro = { workspace = true, features = [
70-
"bzip",
71-
"snappy",
72-
"xz",
73-
"zstandard",
74-
], optional = true }
7568
arrow = { workspace = true }
7669
arrow-ipc = { workspace = true }
7770
chrono = { workspace = true }

datafusion/common/src/error.rs

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@ use std::sync::Arc;
4848
use crate::utils::datafusion_strsim::normalized_levenshtein;
4949
use crate::utils::quote_identifier;
5050
use crate::{Column, DFSchema, Diagnostic, TableReference};
51-
#[cfg(feature = "avro")]
52-
use apache_avro::Error as AvroError;
5351
use arrow::error::ArrowError;
5452
#[cfg(feature = "parquet")]
5553
use parquet::errors::ParquetError;
@@ -76,9 +74,6 @@ pub enum DataFusionError {
7674
/// Error when reading / writing Parquet data.
7775
#[cfg(feature = "parquet")]
7876
ParquetError(Box<ParquetError>),
79-
/// Error when reading Avro data.
80-
#[cfg(feature = "avro")]
81-
AvroError(Box<AvroError>),
8277
/// Error when reading / writing to / from an object_store (e.g. S3 or LocalFile)
8378
#[cfg(feature = "object_store")]
8479
ObjectStore(Box<object_store::Error>),
@@ -332,13 +327,6 @@ impl From<ParquetError> for DataFusionError {
332327
}
333328
}
334329

335-
#[cfg(feature = "avro")]
336-
impl From<AvroError> for DataFusionError {
337-
fn from(e: AvroError) -> Self {
338-
DataFusionError::AvroError(Box::new(e))
339-
}
340-
}
341-
342330
#[cfg(feature = "object_store")]
343331
impl From<object_store::Error> for DataFusionError {
344332
fn from(e: object_store::Error) -> Self {
@@ -389,8 +377,6 @@ impl Error for DataFusionError {
389377
DataFusionError::ArrowError(e, _) => Some(e.as_ref()),
390378
#[cfg(feature = "parquet")]
391379
DataFusionError::ParquetError(e) => Some(e.as_ref()),
392-
#[cfg(feature = "avro")]
393-
DataFusionError::AvroError(e) => Some(e.as_ref()),
394380
#[cfg(feature = "object_store")]
395381
DataFusionError::ObjectStore(e) => Some(e.as_ref()),
396382
DataFusionError::IoError(e) => Some(e),
@@ -520,8 +506,6 @@ impl DataFusionError {
520506
DataFusionError::ArrowError(_, _) => "Arrow error: ",
521507
#[cfg(feature = "parquet")]
522508
DataFusionError::ParquetError(_) => "Parquet error: ",
523-
#[cfg(feature = "avro")]
524-
DataFusionError::AvroError(_) => "Avro error: ",
525509
#[cfg(feature = "object_store")]
526510
DataFusionError::ObjectStore(_) => "Object Store error: ",
527511
DataFusionError::IoError(_) => "IO error: ",
@@ -561,8 +545,6 @@ impl DataFusionError {
561545
}
562546
#[cfg(feature = "parquet")]
563547
DataFusionError::ParquetError(ref desc) => Cow::Owned(desc.to_string()),
564-
#[cfg(feature = "avro")]
565-
DataFusionError::AvroError(ref desc) => Cow::Owned(desc.to_string()),
566548
DataFusionError::IoError(ref desc) => Cow::Owned(desc.to_string()),
567549
#[cfg(feature = "sql")]
568550
DataFusionError::SQL(ref desc, ref backtrace) => {

datafusion/core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ nested_expressions = ["datafusion-functions-nested"]
4343
# This feature is deprecated. Use the `nested_expressions` feature instead.
4444
array_expressions = ["nested_expressions"]
4545
# Used to enable the avro format
46-
avro = ["datafusion-common/avro", "datafusion-datasource-avro"]
46+
avro = ["datafusion-datasource-avro"]
4747
backtrace = ["datafusion-common/backtrace"]
4848
compression = [
4949
"liblzma",

datafusion/core/src/datasource/file_format/avro.rs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ mod tests {
109109
"double_col: Float64",
110110
"date_string_col: Binary",
111111
"string_col: Binary",
112-
"timestamp_col: Timestamp(µs)",
112+
"timestamp_col: Timestamp(µs, \"+00:00\")",
113113
],
114114
x
115115
);
@@ -118,18 +118,18 @@ mod tests {
118118
assert_eq!(batches.len(), 1);
119119

120120
assert_snapshot!(batches_to_string(&batches),@r"
121-
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
122-
| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |
123-
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
124-
| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |
125-
| 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00 |
126-
| 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00 |
127-
| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00 |
128-
| 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00 |
129-
| 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00 |
130-
| 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00 |
131-
| 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00 |
132-
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
121+
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
122+
| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |
123+
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
124+
| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00Z |
125+
| 5 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30332f30312f3039 | 31 | 2009-03-01T00:01:00Z |
126+
| 6 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30342f30312f3039 | 30 | 2009-04-01T00:00:00Z |
127+
| 7 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30342f30312f3039 | 31 | 2009-04-01T00:01:00Z |
128+
| 2 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30322f30312f3039 | 30 | 2009-02-01T00:00:00Z |
129+
| 3 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30322f30312f3039 | 31 | 2009-02-01T00:01:00Z |
130+
| 0 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30312f30312f3039 | 30 | 2009-01-01T00:00:00Z |
131+
| 1 | false | 1 | 1 | 1 | 10 | 1.1 | 10.1 | 30312f30312f3039 | 31 | 2009-01-01T00:01:00Z |
132+
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+----------------------+
133133
");
134134
Ok(())
135135
}

datafusion/core/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,7 @@ pub use object_store;
786786
pub use parquet;
787787

788788
#[cfg(feature = "avro")]
789-
pub use datafusion_datasource_avro::apache_avro;
789+
pub use datafusion_datasource_avro::arrow_avro;
790790

791791
// re-export DataFusion sub-crates at the top level. Use `pub use *`
792792
// so that the contents of the subcrates appears in rustdocs

0 commit comments

Comments
 (0)