@@ -32,6 +32,28 @@ use datafusion_expr::{
3232/// Encodes a string or binary value into binary using the specified character encoding.
3333/// Binary input is interpreted as UTF-8 with lossy conversion (invalid bytes become U+FFFD).
3434///
35+ /// # Target Spark version
36+ /// Targets Spark 3.5 charset semantics — accepts both canonical names and
37+ /// common aliases (`UTF8`, `LATIN1`, `ISO88591`, `ASCII`, `UTF-32BE`, etc.)
38+ /// regardless of ANSI mode. Spark 4.0 tightened this to a strict charset
39+ /// whitelist that rejects aliases with `INVALID_PARAMETER_VALUE.CHARSET`.
40+ /// That stricter whitelist is not implemented here and can be added as an
41+ /// opt-in follow-up.
42+ ///
43+ /// # UTF-32 encoding
44+ /// `UTF-32` is treated as an alias for `UTF-32BE` (no BOM prefix), matching
45+ /// both Spark 3.5 and Spark 4.1.
46+ ///
47+ /// # ANSI mode
48+ /// `spark.sql.ansi.enabled` controls only how unmappable characters are
49+ /// handled (non-ASCII in `US-ASCII`, code points above `U+00FF` in
50+ /// `ISO-8859-1`); it does not activate Spark 4.0's strict charset whitelist.
51+ ///
52+ /// - `spark.sql.ansi.enabled = false` (Spark 3.5 default): silently replace
53+ /// with `?`.
54+ /// - `spark.sql.ansi.enabled = true`: return an error, matching Spark 4.0's
55+ /// `MALFORMED_CHARACTER_CODING`.
56+ ///
3557/// <https://spark.apache.org/docs/latest/api/sql/index.html#encode>
3658#[ derive( Debug , PartialEq , Eq , Hash ) ]
3759pub struct SparkEncode {
@@ -132,28 +154,21 @@ fn encode_string(s: &str, charset: &str, enable_ansi_mode: bool) -> Result<Vec<u
132154 }
133155 Ok ( bytes)
134156 }
135- "UTF-32BE" | "UTF32BE" => {
136- let mut bytes = Vec :: new ( ) ;
157+ // Spark treats UTF-32 as UTF-32BE (no BOM), matching Spark 3.5 and 4.1.
158+ "UTF-32" | "UTF32" | "UTF-32BE" | "UTF32BE" => {
159+ let mut bytes = Vec :: with_capacity ( s. len ( ) * 4 ) ;
137160 for c in s. chars ( ) {
138161 bytes. extend_from_slice ( & ( c as u32 ) . to_be_bytes ( ) ) ;
139162 }
140163 Ok ( bytes)
141164 }
142165 "UTF-32LE" | "UTF32LE" => {
143- let mut bytes = Vec :: new ( ) ;
166+ let mut bytes = Vec :: with_capacity ( s . len ( ) * 4 ) ;
144167 for c in s. chars ( ) {
145168 bytes. extend_from_slice ( & ( c as u32 ) . to_le_bytes ( ) ) ;
146169 }
147170 Ok ( bytes)
148171 }
149- "UTF-32" | "UTF32" => {
150- // BOM (big-endian marker) followed by UTF-32BE encoded bytes
151- let mut bytes = vec ! [ 0x00 , 0x00 , 0xFE , 0xFF ] ;
152- for c in s. chars ( ) {
153- bytes. extend_from_slice ( & ( c as u32 ) . to_be_bytes ( ) ) ;
154- }
155- Ok ( bytes)
156- }
157172 _ => exec_err ! (
158173 "Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE" ,
159174 charset
@@ -584,14 +599,11 @@ mod tests {
584599 }
585600
586601 #[ test]
587- fn test_encode_utf32_with_bom ( ) {
588- // UTF-32 = BOM (0000FEFF) + UTF-32BE
602+ fn test_encode_utf32_no_bom ( ) {
603+ // Spark's UTF-32 = UTF-32BE, no BOM prefix. 'A' = U+0041 → 00 00 00 41
589604 let result =
590605 eval_encode_scalar ( ScalarValue :: Utf8 ( Some ( "A" . into ( ) ) ) , "UTF-32" ) . unwrap ( ) ;
591- assert_eq ! (
592- expect_binary_scalar( result) ,
593- vec![ 0x00 , 0x00 , 0xFE , 0xFF , 0x00 , 0x00 , 0x00 , 0x41 ]
594- ) ;
606+ assert_eq ! ( expect_binary_scalar( result) , vec![ 0x00 , 0x00 , 0x00 , 0x41 ] ) ;
595607 }
596608
597609 #[ test]
@@ -611,14 +623,11 @@ mod tests {
611623 }
612624
613625 #[ test]
614- fn test_encode_emoji_utf32_with_bom ( ) {
615- // UTF-32 = BOM (0000FEFF) + UTF-32BE: 00 01 F6 00
626+ fn test_encode_emoji_utf32_no_bom ( ) {
627+ // Spark's UTF-32 = UTF-32BE, no BOM prefix. U+1F600 (😀) → 00 01 F6 00
616628 let result =
617629 eval_encode_scalar ( ScalarValue :: Utf8 ( Some ( "😀" . into ( ) ) ) , "UTF-32" ) . unwrap ( ) ;
618- assert_eq ! (
619- expect_binary_scalar( result) ,
620- vec![ 0x00 , 0x00 , 0xFE , 0xFF , 0x00 , 0x01 , 0xF6 , 0x00 ]
621- ) ;
630+ assert_eq ! ( expect_binary_scalar( result) , vec![ 0x00 , 0x01 , 0xF6 , 0x00 ] ) ;
622631 }
623632
624633 /// Simple hex encoding for test assertions.
0 commit comments