Skip to content

Commit dd0ad0e

Browse files
committed
fix: UTF-32 to match Spark output (no BOM) and document target version
Spark 3.5 and 4.1 both emit UTF-32 as UTF-32BE without a BOM. Our previous implementation prepended a 0000FEFF BOM, which didn't match any Spark version. Fix this so encode('A', 'UTF-32') produces 00000041 (4 bytes), matching Spark.

Also add a doc comment clarifying:
- Target Spark version (3.5 charset behavior, accepts aliases)
- UTF-32 semantics (alias for UTF-32BE)
- ANSI mode mapping to Spark 3.5 vs 4.0 unmappable-char behavior
1 parent b6dcd45 commit dd0ad0e

2 files changed

Lines changed: 38 additions & 23 deletions

File tree

datafusion/spark/src/function/string/encode.rs

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,28 @@ use datafusion_expr::{
3232
/// Encodes a string or binary value into binary using the specified character encoding.
3333
/// Binary input is interpreted as UTF-8 with lossy conversion (invalid bytes become U+FFFD).
3434
///
35+
/// # Target Spark version
36+
/// Targets Spark 3.5 charset semantics — accepts both canonical names and
37+
/// common aliases (`UTF8`, `LATIN1`, `ISO88591`, `ASCII`, `UTF-32BE`, etc.)
38+
/// regardless of ANSI mode. Spark 4.0 tightened this to a strict charset
39+
/// whitelist that rejects aliases with `INVALID_PARAMETER_VALUE.CHARSET`.
40+
/// That stricter whitelist is not implemented here and can be added as an
41+
/// opt-in follow-up.
42+
///
43+
/// # UTF-32 encoding
44+
/// `UTF-32` is treated as an alias for `UTF-32BE` (no BOM prefix), matching
45+
/// both Spark 3.5 and Spark 4.1.
46+
///
47+
/// # ANSI mode
48+
/// Controls handling of unmappable characters (non-ASCII in `US-ASCII`, code
49+
/// points above `U+00FF` in `ISO-8859-1`). Does not activate Spark 4.0's
50+
/// strict charset whitelist.
51+
///
52+
/// - `spark.sql.ansi.enabled = false` (Spark 3.5 default): silently replace
53+
/// with `?`.
54+
/// - `spark.sql.ansi.enabled = true`: return an error, matching Spark 4.0's
55+
/// `MALFORMED_CHARACTER_CODING`.
56+
///
3557
/// <https://spark.apache.org/docs/latest/api/sql/index.html#encode>
3658
#[derive(Debug, PartialEq, Eq, Hash)]
3759
pub struct SparkEncode {
@@ -132,28 +154,21 @@ fn encode_string(s: &str, charset: &str, enable_ansi_mode: bool) -> Result<Vec<u
132154
}
133155
Ok(bytes)
134156
}
135-
"UTF-32BE" | "UTF32BE" => {
136-
let mut bytes = Vec::new();
157+
// Spark treats UTF-32 as UTF-32BE (no BOM), matching Spark 3.5 and 4.1.
158+
"UTF-32" | "UTF32" | "UTF-32BE" | "UTF32BE" => {
159+
let mut bytes = Vec::with_capacity(s.len() * 4);
137160
for c in s.chars() {
138161
bytes.extend_from_slice(&(c as u32).to_be_bytes());
139162
}
140163
Ok(bytes)
141164
}
142165
"UTF-32LE" | "UTF32LE" => {
143-
let mut bytes = Vec::new();
166+
let mut bytes = Vec::with_capacity(s.len() * 4);
144167
for c in s.chars() {
145168
bytes.extend_from_slice(&(c as u32).to_le_bytes());
146169
}
147170
Ok(bytes)
148171
}
149-
"UTF-32" | "UTF32" => {
150-
// BOM (big-endian marker) followed by UTF-32BE encoded bytes
151-
let mut bytes = vec![0x00, 0x00, 0xFE, 0xFF];
152-
for c in s.chars() {
153-
bytes.extend_from_slice(&(c as u32).to_be_bytes());
154-
}
155-
Ok(bytes)
156-
}
157172
_ => exec_err!(
158173
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE",
159174
charset
@@ -584,14 +599,11 @@ mod tests {
584599
}
585600

586601
#[test]
587-
fn test_encode_utf32_with_bom() {
588-
// UTF-32 = BOM (0000FEFF) + UTF-32BE
602+
fn test_encode_utf32_no_bom() {
603+
// Spark's UTF-32 = UTF-32BE, no BOM prefix. 'A' = U+0041 → 00 00 00 41
589604
let result =
590605
eval_encode_scalar(ScalarValue::Utf8(Some("A".into())), "UTF-32").unwrap();
591-
assert_eq!(
592-
expect_binary_scalar(result),
593-
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41]
594-
);
606+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x00, 0x00, 0x41]);
595607
}
596608

597609
#[test]
@@ -611,14 +623,11 @@ mod tests {
611623
}
612624

613625
#[test]
614-
fn test_encode_emoji_utf32_with_bom() {
615-
// UTF-32 = BOM (0000FEFF) + UTF-32BE: 00 01 F6 00
626+
fn test_encode_emoji_utf32_no_bom() {
627+
// Spark's UTF-32 = UTF-32BE, no BOM prefix. U+1F600 (😀) → 00 01 F6 00
616628
let result =
617629
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32").unwrap();
618-
assert_eq!(
619-
expect_binary_scalar(result),
620-
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x01, 0xF6, 0x00]
621-
);
630+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x01, 0xF6, 0x00]);
622631
}
623632

624633
/// Simple hex encoding for test assertions.

datafusion/sqllogictest/test_files/spark/string/encode.slt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ SELECT encode('A'::string, 'utf-32be'::string);
4545
----
4646
00000041
4747

48+
# UTF-32 (Spark 3.5 / 4.1: no BOM, identical to UTF-32BE)
49+
query ?
50+
SELECT encode('A'::string, 'utf-32'::string);
51+
----
52+
00000041
53+
4854
# Case-insensitive charset
4955
query ?
5056
SELECT encode('hello'::string, 'Utf-8'::string);

0 commit comments

Comments (0)