Skip to content

Commit dd0ad0e

Browse files
committed
fix: UTF-32 to match Spark output (no BOM) and document target version
Spark 3.5 and 4.1 both emit UTF-32 as UTF-32BE without a BOM. Our previous implementation prepended a 0000FEFF BOM, which didn't match any Spark version. Fix this so encode('A', 'UTF-32') produces 00000041 (4 bytes), matching Spark.

Also add a doc comment clarifying:
- Target Spark version (3.5 charset behavior, accepts aliases)
- UTF-32 semantics (alias for UTF-32BE)
- ANSI mode mapping to Spark 3.5 vs 4.0 unmappable-char behavior
1 parent b6dcd45 commit dd0ad0e

2 files changed

Lines changed: 38 additions & 23 deletions

File tree

datafusion/spark/src/function/string/encode.rs

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,28 @@ use datafusion_expr::{
3232
/// Encodes a string or binary value into binary using the specified character encoding.
3333
/// Binary input is interpreted as UTF-8 with lossy conversion (invalid bytes become U+FFFD).
3434
///
35+
/// # Target Spark version
36+
/// Targets Spark 3.5 charset semantics — accepts both canonical names and
37+
/// common aliases (`UTF8`, `LATIN1`, `ISO88591`, `ASCII`, `UTF-32BE`, etc.)
38+
/// regardless of ANSI mode. Spark 4.0 tightened this to a strict charset
39+
/// whitelist that rejects aliases with `INVALID_PARAMETER_VALUE.CHARSET`.
40+
/// That stricter whitelist is not implemented here and can be added as an
41+
/// opt-in follow-up.
42+
///
43+
/// # UTF-32 encoding
44+
/// `UTF-32` is treated as an alias for `UTF-32BE` (no BOM prefix), matching
45+
/// both Spark 3.5 and Spark 4.1.
46+
///
47+
/// # ANSI mode
48+
/// Controls handling of unmappable characters (non-ASCII in `US-ASCII`, code
49+
/// points above `U+00FF` in `ISO-8859-1`). Does not activate Spark 4.0's
50+
/// strict charset whitelist.
51+
///
52+
/// - `spark.sql.ansi.enabled = false` (Spark 3.5 default): silently replace
53+
/// with `?`.
54+
/// - `spark.sql.ansi.enabled = true`: return an error, matching Spark 4.0's
55+
/// `MALFORMED_CHARACTER_CODING`.
56+
///
3557
/// <https://spark.apache.org/docs/latest/api/sql/index.html#encode>
3658
#[derive(Debug, PartialEq, Eq, Hash)]
3759
pub struct SparkEncode {
@@ -132,28 +154,21 @@ fn encode_string(s: &str, charset: &str, enable_ansi_mode: bool) -> Result<Vec<u
132154
}
133155
Ok(bytes)
134156
}
135-
"UTF-32BE" | "UTF32BE" => {
136-
let mut bytes = Vec::new();
157+
// Spark treats UTF-32 as UTF-32BE (no BOM), matching Spark 3.5 and 4.1.
158+
"UTF-32" | "UTF32" | "UTF-32BE" | "UTF32BE" => {
159+
let mut bytes = Vec::with_capacity(s.len() * 4);
137160
for c in s.chars() {
138161
bytes.extend_from_slice(&(c as u32).to_be_bytes());
139162
}
140163
Ok(bytes)
141164
}
142165
"UTF-32LE" | "UTF32LE" => {
143-
let mut bytes = Vec::new();
166+
let mut bytes = Vec::with_capacity(s.len() * 4);
144167
for c in s.chars() {
145168
bytes.extend_from_slice(&(c as u32).to_le_bytes());
146169
}
147170
Ok(bytes)
148171
}
149-
"UTF-32" | "UTF32" => {
150-
// BOM (big-endian marker) followed by UTF-32BE encoded bytes
151-
let mut bytes = vec![0x00, 0x00, 0xFE, 0xFF];
152-
for c in s.chars() {
153-
bytes.extend_from_slice(&(c as u32).to_be_bytes());
154-
}
155-
Ok(bytes)
156-
}
157172
_ => exec_err!(
158173
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE",
159174
charset
@@ -584,14 +599,11 @@ mod tests {
584599
}
585600

586601
#[test]
587-
fn test_encode_utf32_with_bom() {
588-
// UTF-32 = BOM (0000FEFF) + UTF-32BE
602+
fn test_encode_utf32_no_bom() {
603+
// Spark's UTF-32 = UTF-32BE, no BOM prefix. 'A' = U+0041 → 00 00 00 41
589604
let result =
590605
eval_encode_scalar(ScalarValue::Utf8(Some("A".into())), "UTF-32").unwrap();
591-
assert_eq!(
592-
expect_binary_scalar(result),
593-
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41]
594-
);
606+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x00, 0x00, 0x41]);
595607
}
596608

597609
#[test]
@@ -611,14 +623,11 @@ mod tests {
611623
}
612624

613625
#[test]
614-
fn test_encode_emoji_utf32_with_bom() {
615-
// UTF-32 = BOM (0000FEFF) + UTF-32BE: 00 01 F6 00
626+
fn test_encode_emoji_utf32_no_bom() {
627+
// Spark's UTF-32 = UTF-32BE, no BOM prefix. U+1F600 (😀) → 00 01 F6 00
616628
let result =
617629
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32").unwrap();
618-
assert_eq!(
619-
expect_binary_scalar(result),
620-
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x01, 0xF6, 0x00]
621-
);
630+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x01, 0xF6, 0x00]);
622631
}
623632

624633
/// Simple hex encoding for test assertions.

datafusion/sqllogictest/test_files/spark/string/encode.slt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ SELECT encode('A'::string, 'utf-32be'::string);
4545
----
4646
00000041
4747

48+
# UTF-32 (Spark 3.5 / 4.1: no BOM, identical to UTF-32BE)
49+
query ?
50+
SELECT encode('A'::string, 'utf-32'::string);
51+
----
52+
00000041
53+
4854
# Case-insensitive charset
4955
query ?
5056
SELECT encode('hello'::string, 'Utf-8'::string);

0 commit comments

Comments (0)