@@ -70,20 +70,46 @@ impl SparkEncode {
7070}
7171
7272/// Encodes a single string value using the specified charset.
// NOTE(review): this span is a scraped unified-diff fragment — "-" lines are the
// removed (old) code, "+" lines are the added (new) code, and the leading digits
// are the old/new line numbers fused onto each line. It is not compilable as-is.
// The change threads a new `enable_ansi_mode` flag through `encode_string`:
// in ANSI mode an unmappable character returns an execution error; in legacy
// mode it is replaced with a '?' byte (matching the doc lines added below).
73- fn encode_string ( s : & str , charset : & str ) -> Result < Vec < u8 > > {
73+ /// In ANSI mode, unmappable characters cause an error.
74+ /// In legacy mode, unmappable characters are replaced with `?`.
75+ fn encode_string ( s : & str , charset : & str , enable_ansi_mode : bool ) -> Result < Vec < u8 > > {
7476 match charset {
// UTF-8 needs no transcoding: Rust &str is already valid UTF-8 bytes.
7577 "UTF-8" | "UTF8" => Ok ( s. as_bytes ( ) . to_vec ( ) ) ,
76- "US-ASCII" | "ASCII" => Ok ( s
77- . chars ( )
78- . map ( |c| if c. is_ascii ( ) { c as u8 } else { b'?' } )
79- . collect ( ) ) ,
80- "ISO-8859-1" | "ISO88591" | "LATIN1" => Ok ( s
81- . chars ( )
82- . map ( |c| {
// New US-ASCII branch: ASCII chars are emitted as single bytes; a non-ASCII
// char either aborts with exec_err! (ANSI mode) or becomes b'?' (legacy mode).
78+ "US-ASCII" | "ASCII" => {
79+ let mut bytes = Vec :: with_capacity ( s. len ( ) ) ;
80+ for c in s. chars ( ) {
81+ if c. is_ascii ( ) {
82+ bytes. push ( c as u8 ) ;
83+ } else if enable_ansi_mode {
84+ return exec_err ! (
85+ "cannot encode character '{}' (U+{:04X}) in US-ASCII" ,
86+ c,
87+ c as u32
88+ ) ;
89+ } else {
90+ bytes. push ( b'?' ) ;
91+ }
92+ }
93+ Ok ( bytes)
94+ }
// New ISO-8859-1 branch: code points <= 255 map directly to one byte; larger
// code points error in ANSI mode or become '?' in legacy mode. (The stray "-"
// lines below are the removed closure-based implementation, interleaved by the
// diff rendering.)
95+ "ISO-8859-1" | "ISO88591" | "LATIN1" => {
96+ let mut bytes = Vec :: with_capacity ( s. len ( ) ) ;
97+ for c in s. chars ( ) {
8398 let cp = c as u32 ;
84- if cp > 255 { b'?' } else { cp as u8 }
85- } )
86- . collect ( ) ) ,
99+ if cp <= 255 {
100+ bytes. push ( cp as u8 ) ;
101+ } else if enable_ansi_mode {
102+ return exec_err ! (
103+ "cannot encode character '{}' (U+{:04X}) in ISO-8859-1" ,
104+ c,
105+ c as u32
106+ ) ;
107+ } else {
108+ bytes. push ( b'?' ) ;
109+ }
110+ }
111+ Ok ( bytes)
112+ }
// The UTF-16BE branch (and the rest of this function) continues past this hunk
// and is not visible here.
87113 "UTF-16BE" | "UTF16BE" => {
88114 let mut bytes = Vec :: new ( ) ;
89115 for code_unit in s. encode_utf16 ( ) {
@@ -139,6 +165,7 @@ fn encode_string(s: &str, charset: &str) -> Result<Vec<u8>> {
// NOTE(review): scraped unified-diff fragment for `encode_array` — "+" lines are
// added code, leading digits are fused line numbers, and the hunk boundary below
// omits unchanged context lines. The change adds an `enable_ansi_mode` parameter
// and forwards it to `encode_string` for each non-null row.
139165fn encode_array < ' a , S : StringArrayType < ' a > > (
140166 string_array : & S ,
141167 charset : & str ,
168+ enable_ansi_mode : bool ,
142169) -> Result < ArrayRef > {
143170 let mut builder =
// Capacity heuristic: assumes ~4 output bytes per input row — TODO confirm.
144171 BinaryBuilder :: with_capacity ( string_array. len ( ) , string_array. len ( ) * 4 ) ;
@@ -147,7 +174,7 @@ fn encode_array<'a, S: StringArrayType<'a>>(
// Null rows stay null; non-null rows are encoded and `?` propagates any
// ANSI-mode encoding error.
147174 builder. append_null ( ) ;
148175 } else {
149176 let s = string_array. value ( i) ;
150- let encoded = encode_string ( s, charset) ?;
177+ let encoded = encode_string ( s, charset, enable_ansi_mode ) ?;
151178 builder. append_value ( & encoded) ;
152179 }
153180 }
@@ -159,6 +186,7 @@ fn encode_array<'a, S: StringArrayType<'a>>(
// NOTE(review): scraped unified-diff fragment for `encode_binary_array`; same
// change as `encode_array` — thread the new `enable_ansi_mode` flag through to
// `encode_string`. The hunk boundary below omits unchanged context lines.
159186fn encode_binary_array < ' a , B : arrow:: array:: BinaryArrayType < ' a > > (
160187 binary_array : & ' a B ,
161188 charset : & str ,
189+ enable_ansi_mode : bool ,
162190) -> Result < ArrayRef > {
163191 let mut builder =
164192 BinaryBuilder :: with_capacity ( binary_array. len ( ) , binary_array. len ( ) * 4 ) ;
@@ -167,22 +195,38 @@ fn encode_binary_array<'a, B: arrow::array::BinaryArrayType<'a>>(
167195 builder. append_null ( ) ;
168196 } else {
// Lossy UTF-8 interpretation: invalid byte sequences become U+FFFD before
// re-encoding into the target charset.
169197 let s = String :: from_utf8_lossy ( binary_array. value ( i) ) ;
170- let encoded = encode_string ( & s, charset) ?;
198+ let encoded = encode_string ( & s, charset, enable_ansi_mode ) ?;
171199 builder. append_value ( & encoded) ;
172200 }
173201 }
174202 Ok ( Arc :: new ( builder. finish ( ) ) )
175203}
176204
177205/// Dispatches to the correct typed array encoder based on the DataType.
// NOTE(review): scraped unified-diff fragment for `encode_dispatch` — the change
// adds an `enable_ansi_mode` parameter and forwards it to every typed encoder.
// The DataType::Null arm (and anything after it, e.g. a default arm) continues
// past this hunk and is not fully visible here.
178- fn encode_dispatch ( arr : & ArrayRef , charset : & str ) -> Result < ArrayRef > {
206+ fn encode_dispatch (
207+ arr : & ArrayRef ,
208+ charset : & str ,
209+ enable_ansi_mode : bool ,
210+ ) -> Result < ArrayRef > {
179211 match arr. data_type ( ) {
180- DataType :: Utf8 => encode_array ( & arr. as_string :: < i32 > ( ) , charset) ,
181- DataType :: LargeUtf8 => encode_array ( & arr. as_string :: < i64 > ( ) , charset) ,
182- DataType :: Utf8View => encode_array ( & arr. as_string_view ( ) , charset) ,
183- DataType :: Binary => encode_binary_array ( & arr. as_binary :: < i32 > ( ) , charset) ,
184- DataType :: LargeBinary => encode_binary_array ( & arr. as_binary :: < i64 > ( ) , charset) ,
185- DataType :: BinaryView => encode_binary_array ( & arr. as_binary_view ( ) , charset) ,
// String-typed arrays route to encode_array; binary-typed arrays route to
// encode_binary_array (which lossily reinterprets bytes as UTF-8 first).
212+ DataType :: Utf8 => {
213+ encode_array ( & arr. as_string :: < i32 > ( ) , charset, enable_ansi_mode)
214+ }
215+ DataType :: LargeUtf8 => {
216+ encode_array ( & arr. as_string :: < i64 > ( ) , charset, enable_ansi_mode)
217+ }
218+ DataType :: Utf8View => {
219+ encode_array ( & arr. as_string_view ( ) , charset, enable_ansi_mode)
220+ }
221+ DataType :: Binary => {
222+ encode_binary_array ( & arr. as_binary :: < i32 > ( ) , charset, enable_ansi_mode)
223+ }
224+ DataType :: LargeBinary => {
225+ encode_binary_array ( & arr. as_binary :: < i64 > ( ) , charset, enable_ansi_mode)
226+ }
227+ DataType :: BinaryView => {
228+ encode_binary_array ( & arr. as_binary_view ( ) , charset, enable_ansi_mode)
229+ }
186230 DataType :: Null => {
187231 let mut builder = BinaryBuilder :: new ( ) ;
188232 for _ in 0 ..arr. len ( ) {
@@ -265,6 +309,7 @@ impl ScalarUDFImpl for SparkEncode {
// NOTE(review): scraped unified-diff fragment from inside `invoke_with_args` —
// the change reads the session's ANSI-mode flag from the function-call config
// and passes it to `encode_dispatch` alongside the charset. Hunk boundaries
// omit unchanged context; the method's start and end are not visible here.
265309 }
266310
267311 let charset = extract_charset ( & args. args [ 1 ] ) ?;
312+ let enable_ansi_mode = args. config_options . execution . enable_ansi_mode ;
268313
269314 // Determine if the result should be scalar or array
270315 let len = args. args . iter ( ) . find_map ( |arg| match arg {
@@ -279,7 +324,7 @@ fn encode_array<'a, S: StringArrayType<'a>>(
279324 ColumnarValue :: Array ( array) => Arc :: clone ( array) ,
280325 } ;
281326
282- let result = encode_dispatch ( & string_arr, & charset) ?;
327- let result = encode_dispatch ( & string_arr, & charset, enable_ansi_mode ) ?;
// Scalar inputs collapse the one-row result array back to a scalar.
284329 if is_scalar {
285330 ScalarValue :: try_from_array ( & result, 0 ) . map ( ColumnarValue :: Scalar )
@@ -297,10 +342,16 @@ mod tests {
// NOTE(review): scraped unified-diff fragment of the test helper. The change
// renames `eval_encode_scalar` to `eval_encode_scalar_with_ansi`, adds an
// `enable_ansi_mode` parameter, and installs it into a ConfigOptions that is
// passed via `config_options`. The hunk boundary below cuts out the second
// (charset) argument of the `args` vec — not visible here.
297342 use datafusion_expr:: ScalarUDF ;
298343
299344 /// Helper to invoke encode as a scalar with two literal string arguments.
300- fn eval_encode_scalar ( input : ScalarValue , charset : & str ) -> Result < ColumnarValue > {
345+ fn eval_encode_scalar_with_ansi (
346+ input : ScalarValue ,
347+ charset : & str ,
348+ enable_ansi_mode : bool ,
349+ ) -> Result < ColumnarValue > {
301350 let func = SparkEncode :: new ( ) ;
302351 let input_field = Arc :: new ( Field :: new ( "input" , input. data_type ( ) , true ) ) ;
303352 let charset_field = Arc :: new ( Field :: new ( "charset" , DataType :: Utf8 , false ) ) ;
// Build a config carrying the requested ANSI-mode setting instead of defaults.
353+ let mut config = ConfigOptions :: default ( ) ;
354+ config. execution . enable_ansi_mode = enable_ansi_mode;
304355 func. invoke_with_args ( ScalarFunctionArgs {
305356 args : vec ! [
306357 ColumnarValue :: Scalar ( input) ,
@@ -309,10 +360,15 @@ mod tests {
309360 arg_fields : vec ! [ input_field, charset_field] ,
310361 number_rows : 1 ,
311362 return_field : Arc :: new ( Field :: new ( "encode" , DataType :: Binary , true ) ) ,
312- config_options : Arc :: new ( ConfigOptions :: default ( ) ) ,
363+ config_options : Arc :: new ( config ) ,
313364 } )
314365 }
315366
367+ /// Helper with ANSI mode disabled (legacy behavior).
368+ fn eval_encode_scalar ( input : ScalarValue , charset : & str ) -> Result < ColumnarValue > {
369+ eval_encode_scalar_with_ansi ( input, charset, false )
370+ }
371+
// NOTE(review): scraped unified-diff fragment — `expect_binary_scalar` unwraps a
// scalar Binary result into its bytes; its non-matching arm(s) are outside this
// hunk, and the trailing lines below are the cut-off tail of a different test.
316372 fn expect_binary_scalar ( result : ColumnarValue ) -> Vec < u8 > {
317373 match result {
318374 ColumnarValue :: Scalar ( ScalarValue :: Binary ( Some ( bytes) ) ) => bytes,
@@ -398,6 +454,52 @@ fn encode_array<'a, S: StringArrayType<'a>>(
398454 ) ;
399455 }
400456
#[test]
fn test_encode_ascii_unmappable_legacy_mode() {
    // With ANSI mode off (legacy Spark semantics), a character outside the
    // US-ASCII range ('é', U+00E9) is substituted with a single '?' byte.
    let encoded = eval_encode_scalar(
        ScalarValue::Utf8(Some("\u{00E9}".to_string())),
        "US-ASCII",
    )
    .expect("legacy-mode encode should not error");
    assert_eq!(expect_binary_scalar(encoded), b"?".to_vec());
}
467+
#[test]
fn test_encode_ascii_unmappable_ansi_mode() {
    // With ANSI mode on, the same non-ASCII character ('é', U+00E9) must be
    // rejected rather than replaced.
    let outcome = eval_encode_scalar_with_ansi(
        ScalarValue::Utf8(Some("\u{00E9}".to_string())),
        "US-ASCII",
        true,
    );
    let err = outcome.expect_err("ANSI-mode encode of non-ASCII input must fail");
    assert!(err.to_string().contains("cannot encode"));
}
479+
#[test]
fn test_encode_iso8859_unmappable_legacy_mode() {
    // 'Ā' (U+0100) is the first code point beyond the ISO-8859-1 range; in
    // legacy mode it is replaced with a single '?' byte.
    let encoded = eval_encode_scalar(
        ScalarValue::Utf8(Some("\u{0100}".to_string())),
        "ISO-8859-1",
    )
    .expect("legacy-mode encode should not error");
    assert_eq!(expect_binary_scalar(encoded), b"?".to_vec());
}
490+
#[test]
fn test_encode_iso8859_unmappable_ansi_mode() {
    // In ANSI mode a code point above U+00FF ('Ā', U+0100) must be rejected
    // for ISO-8859-1 rather than replaced.
    let outcome = eval_encode_scalar_with_ansi(
        ScalarValue::Utf8(Some("\u{0100}".to_string())),
        "ISO-8859-1",
        true,
    );
    let err = outcome.expect_err("ANSI-mode encode of U+0100 must fail");
    assert!(err.to_string().contains("cannot encode"));
}
502+
// NOTE(review): truncated fragment — the body of `test_encode_null_input`
// continues beyond the visible source; only the function's first line is shown.
401503 #[ test]
402504 fn test_encode_null_input ( ) {
403505 let func = SparkEncode :: new ( ) ;
0 commit comments