Skip to content

Commit 835ae8d

Browse files
committed
feat: Add ANSI mode support for unmappable characters
In ANSI mode (the default), encoding a character that cannot be represented in the target charset (e.g., a non-ASCII character in US-ASCII) returns an error. In legacy mode, unmappable characters are silently replaced with '?'.
1 parent e169a4f commit 835ae8d

1 file changed

Lines changed: 125 additions & 23 deletions

File tree

  • datafusion/spark/src/function/string

datafusion/spark/src/function/string/encode.rs

Lines changed: 125 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -70,20 +70,46 @@ impl SparkEncode {
7070
}
7171

7272
/// Encodes a single string value using the specified charset.
73-
fn encode_string(s: &str, charset: &str) -> Result<Vec<u8>> {
73+
/// In ANSI mode, unmappable characters cause an error.
74+
/// In legacy mode, unmappable characters are replaced with `?`.
75+
fn encode_string(s: &str, charset: &str, enable_ansi_mode: bool) -> Result<Vec<u8>> {
7476
match charset {
7577
"UTF-8" | "UTF8" => Ok(s.as_bytes().to_vec()),
76-
"US-ASCII" | "ASCII" => Ok(s
77-
.chars()
78-
.map(|c| if c.is_ascii() { c as u8 } else { b'?' })
79-
.collect()),
80-
"ISO-8859-1" | "ISO88591" | "LATIN1" => Ok(s
81-
.chars()
82-
.map(|c| {
78+
"US-ASCII" | "ASCII" => {
79+
let mut bytes = Vec::with_capacity(s.len());
80+
for c in s.chars() {
81+
if c.is_ascii() {
82+
bytes.push(c as u8);
83+
} else if enable_ansi_mode {
84+
return exec_err!(
85+
"cannot encode character '{}' (U+{:04X}) in US-ASCII",
86+
c,
87+
c as u32
88+
);
89+
} else {
90+
bytes.push(b'?');
91+
}
92+
}
93+
Ok(bytes)
94+
}
95+
"ISO-8859-1" | "ISO88591" | "LATIN1" => {
96+
let mut bytes = Vec::with_capacity(s.len());
97+
for c in s.chars() {
8398
let cp = c as u32;
84-
if cp > 255 { b'?' } else { cp as u8 }
85-
})
86-
.collect()),
99+
if cp <= 255 {
100+
bytes.push(cp as u8);
101+
} else if enable_ansi_mode {
102+
return exec_err!(
103+
"cannot encode character '{}' (U+{:04X}) in ISO-8859-1",
104+
c,
105+
c as u32
106+
);
107+
} else {
108+
bytes.push(b'?');
109+
}
110+
}
111+
Ok(bytes)
112+
}
87113
"UTF-16BE" | "UTF16BE" => {
88114
let mut bytes = Vec::new();
89115
for code_unit in s.encode_utf16() {
@@ -139,6 +165,7 @@ fn encode_string(s: &str, charset: &str) -> Result<Vec<u8>> {
139165
fn encode_array<'a, S: StringArrayType<'a>>(
140166
string_array: &S,
141167
charset: &str,
168+
enable_ansi_mode: bool,
142169
) -> Result<ArrayRef> {
143170
let mut builder =
144171
BinaryBuilder::with_capacity(string_array.len(), string_array.len() * 4);
@@ -147,7 +174,7 @@ fn encode_array<'a, S: StringArrayType<'a>>(
147174
builder.append_null();
148175
} else {
149176
let s = string_array.value(i);
150-
let encoded = encode_string(s, charset)?;
177+
let encoded = encode_string(s, charset, enable_ansi_mode)?;
151178
builder.append_value(&encoded);
152179
}
153180
}
@@ -159,6 +186,7 @@ fn encode_array<'a, S: StringArrayType<'a>>(
159186
fn encode_binary_array<'a, B: arrow::array::BinaryArrayType<'a>>(
160187
binary_array: &'a B,
161188
charset: &str,
189+
enable_ansi_mode: bool,
162190
) -> Result<ArrayRef> {
163191
let mut builder =
164192
BinaryBuilder::with_capacity(binary_array.len(), binary_array.len() * 4);
@@ -167,22 +195,38 @@ fn encode_binary_array<'a, B: arrow::array::BinaryArrayType<'a>>(
167195
builder.append_null();
168196
} else {
169197
let s = String::from_utf8_lossy(binary_array.value(i));
170-
let encoded = encode_string(&s, charset)?;
198+
let encoded = encode_string(&s, charset, enable_ansi_mode)?;
171199
builder.append_value(&encoded);
172200
}
173201
}
174202
Ok(Arc::new(builder.finish()))
175203
}
176204

177205
/// Dispatches to the correct typed array encoder based on the DataType.
178-
fn encode_dispatch(arr: &ArrayRef, charset: &str) -> Result<ArrayRef> {
206+
fn encode_dispatch(
207+
arr: &ArrayRef,
208+
charset: &str,
209+
enable_ansi_mode: bool,
210+
) -> Result<ArrayRef> {
179211
match arr.data_type() {
180-
DataType::Utf8 => encode_array(&arr.as_string::<i32>(), charset),
181-
DataType::LargeUtf8 => encode_array(&arr.as_string::<i64>(), charset),
182-
DataType::Utf8View => encode_array(&arr.as_string_view(), charset),
183-
DataType::Binary => encode_binary_array(&arr.as_binary::<i32>(), charset),
184-
DataType::LargeBinary => encode_binary_array(&arr.as_binary::<i64>(), charset),
185-
DataType::BinaryView => encode_binary_array(&arr.as_binary_view(), charset),
212+
DataType::Utf8 => {
213+
encode_array(&arr.as_string::<i32>(), charset, enable_ansi_mode)
214+
}
215+
DataType::LargeUtf8 => {
216+
encode_array(&arr.as_string::<i64>(), charset, enable_ansi_mode)
217+
}
218+
DataType::Utf8View => {
219+
encode_array(&arr.as_string_view(), charset, enable_ansi_mode)
220+
}
221+
DataType::Binary => {
222+
encode_binary_array(&arr.as_binary::<i32>(), charset, enable_ansi_mode)
223+
}
224+
DataType::LargeBinary => {
225+
encode_binary_array(&arr.as_binary::<i64>(), charset, enable_ansi_mode)
226+
}
227+
DataType::BinaryView => {
228+
encode_binary_array(&arr.as_binary_view(), charset, enable_ansi_mode)
229+
}
186230
DataType::Null => {
187231
let mut builder = BinaryBuilder::new();
188232
for _ in 0..arr.len() {
@@ -265,6 +309,7 @@ impl ScalarUDFImpl for SparkEncode {
265309
}
266310

267311
let charset = extract_charset(&args.args[1])?;
312+
let enable_ansi_mode = args.config_options.execution.enable_ansi_mode;
268313

269314
// Determine if the result should be scalar or array
270315
let len = args.args.iter().find_map(|arg| match arg {
@@ -279,7 +324,7 @@ impl ScalarUDFImpl for SparkEncode {
279324
ColumnarValue::Array(array) => Arc::clone(array),
280325
};
281326

282-
let result = encode_dispatch(&string_arr, &charset)?;
327+
let result = encode_dispatch(&string_arr, &charset, enable_ansi_mode)?;
283328

284329
if is_scalar {
285330
ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar)
@@ -297,10 +342,16 @@ mod tests {
297342
use datafusion_expr::ScalarUDF;
298343

299344
/// Helper to invoke encode as a scalar with two literal string arguments.
300-
fn eval_encode_scalar(input: ScalarValue, charset: &str) -> Result<ColumnarValue> {
345+
fn eval_encode_scalar_with_ansi(
346+
input: ScalarValue,
347+
charset: &str,
348+
enable_ansi_mode: bool,
349+
) -> Result<ColumnarValue> {
301350
let func = SparkEncode::new();
302351
let input_field = Arc::new(Field::new("input", input.data_type(), true));
303352
let charset_field = Arc::new(Field::new("charset", DataType::Utf8, false));
353+
let mut config = ConfigOptions::default();
354+
config.execution.enable_ansi_mode = enable_ansi_mode;
304355
func.invoke_with_args(ScalarFunctionArgs {
305356
args: vec![
306357
ColumnarValue::Scalar(input),
@@ -309,10 +360,15 @@ mod tests {
309360
arg_fields: vec![input_field, charset_field],
310361
number_rows: 1,
311362
return_field: Arc::new(Field::new("encode", DataType::Binary, true)),
312-
config_options: Arc::new(ConfigOptions::default()),
363+
config_options: Arc::new(config),
313364
})
314365
}
315366

367+
/// Helper with ANSI mode disabled (legacy behavior).
368+
fn eval_encode_scalar(input: ScalarValue, charset: &str) -> Result<ColumnarValue> {
369+
eval_encode_scalar_with_ansi(input, charset, false)
370+
}
371+
316372
fn expect_binary_scalar(result: ColumnarValue) -> Vec<u8> {
317373
match result {
318374
ColumnarValue::Scalar(ScalarValue::Binary(Some(bytes))) => bytes,
@@ -398,6 +454,52 @@ mod tests {
398454
);
399455
}
400456

457+
#[test]
458+
fn test_encode_ascii_unmappable_legacy_mode() {
459+
// Legacy mode: non-ASCII chars replaced with '?'
460+
let result = eval_encode_scalar(
461+
ScalarValue::Utf8(Some("\u{00E9}".into())), // é
462+
"US-ASCII",
463+
)
464+
.unwrap();
465+
assert_eq!(expect_binary_scalar(result), vec![b'?']);
466+
}
467+
468+
#[test]
469+
fn test_encode_ascii_unmappable_ansi_mode() {
470+
// ANSI mode: non-ASCII chars cause an error
471+
let result = eval_encode_scalar_with_ansi(
472+
ScalarValue::Utf8(Some("\u{00E9}".into())),
473+
"US-ASCII",
474+
true,
475+
);
476+
assert!(result.is_err());
477+
assert!(result.unwrap_err().to_string().contains("cannot encode"));
478+
}
479+
480+
#[test]
481+
fn test_encode_iso8859_unmappable_legacy_mode() {
482+
// Legacy mode: chars > U+00FF replaced with '?'
483+
let result = eval_encode_scalar(
484+
ScalarValue::Utf8(Some("\u{0100}".into())), // Ā (U+0100)
485+
"ISO-8859-1",
486+
)
487+
.unwrap();
488+
assert_eq!(expect_binary_scalar(result), vec![b'?']);
489+
}
490+
491+
#[test]
492+
fn test_encode_iso8859_unmappable_ansi_mode() {
493+
// ANSI mode: chars > U+00FF cause an error
494+
let result = eval_encode_scalar_with_ansi(
495+
ScalarValue::Utf8(Some("\u{0100}".into())),
496+
"ISO-8859-1",
497+
true,
498+
);
499+
assert!(result.is_err());
500+
assert!(result.unwrap_err().to_string().contains("cannot encode"));
501+
}
502+
401503
#[test]
402504
fn test_encode_null_input() {
403505
let func = SparkEncode::new();

0 commit comments

Comments
 (0)