Skip to content

Commit 93f1ec9

Browse files
committed
feat: Add UTF-32 charset support
1 parent db519be commit 93f1ec9

1 file changed

Lines changed: 83 additions & 1 deletion

File tree

  • datafusion/spark/src/function/string

datafusion/spark/src/function/string/encode.rs

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,30 @@ fn encode_string(s: &str, charset: &str) -> Result<Vec<u8>> {
106106
}
107107
Ok(bytes)
108108
}
109+
"UTF-32BE" | "UTF32BE" => {
110+
let mut bytes = Vec::new();
111+
for c in s.chars() {
112+
bytes.extend_from_slice(&(c as u32).to_be_bytes());
113+
}
114+
Ok(bytes)
115+
}
116+
"UTF-32LE" | "UTF32LE" => {
117+
let mut bytes = Vec::new();
118+
for c in s.chars() {
119+
bytes.extend_from_slice(&(c as u32).to_le_bytes());
120+
}
121+
Ok(bytes)
122+
}
123+
"UTF-32" | "UTF32" => {
124+
// BOM (big-endian marker) followed by UTF-32BE encoded bytes
125+
let mut bytes = vec![0x00, 0x00, 0xFE, 0xFF];
126+
for c in s.chars() {
127+
bytes.extend_from_slice(&(c as u32).to_be_bytes());
128+
}
129+
Ok(bytes)
130+
}
109131
_ => exec_err!(
110-
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE",
132+
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE",
111133
charset
112134
),
113135
}
@@ -547,6 +569,66 @@ mod tests {
547569
);
548570
}
549571

572+
#[test]
573+
fn test_encode_utf32be() {
574+
// 'A' = U+0041 → 00 00 00 41, 'B' = U+0042 → 00 00 00 42
575+
let result =
576+
eval_encode_scalar(ScalarValue::Utf8(Some("AB".into())), "UTF-32BE").unwrap();
577+
assert_eq!(
578+
expect_binary_scalar(result),
579+
vec![0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x42]
580+
);
581+
}
582+
583+
#[test]
584+
fn test_encode_utf32le() {
585+
// 'A' = U+0041 → 41 00 00 00, 'B' = U+0042 → 42 00 00 00
586+
let result =
587+
eval_encode_scalar(ScalarValue::Utf8(Some("AB".into())), "UTF-32LE").unwrap();
588+
assert_eq!(
589+
expect_binary_scalar(result),
590+
vec![0x41, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00]
591+
);
592+
}
593+
594+
#[test]
595+
fn test_encode_utf32_with_bom() {
596+
// UTF-32 = BOM (0000FEFF) + UTF-32BE
597+
let result =
598+
eval_encode_scalar(ScalarValue::Utf8(Some("A".into())), "UTF-32").unwrap();
599+
assert_eq!(
600+
expect_binary_scalar(result),
601+
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41]
602+
);
603+
}
604+
605+
#[test]
606+
fn test_encode_emoji_utf32be() {
607+
// U+1F600 (😀) → 00 01 F6 00
608+
let result =
609+
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32BE").unwrap();
610+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x01, 0xF6, 0x00]);
611+
}
612+
613+
#[test]
614+
fn test_encode_emoji_utf32le() {
615+
// U+1F600 (😀) → 00 F6 01 00 (little-endian)
616+
let result =
617+
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32LE").unwrap();
618+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0xF6, 0x01, 0x00]);
619+
}
620+
621+
#[test]
622+
fn test_encode_emoji_utf32_with_bom() {
623+
// UTF-32 = BOM (0000FEFF) + UTF-32BE: 00 01 F6 00
624+
let result =
625+
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32").unwrap();
626+
assert_eq!(
627+
expect_binary_scalar(result),
628+
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x01, 0xF6, 0x00]
629+
);
630+
}
631+
550632
/// Simple hex encoding for test assertions.
551633
fn hex_encode(bytes: &[u8]) -> String {
552634
bytes.iter().map(|b| format!("{b:02X}")).collect::<String>()

0 commit comments

Comments
(0)