Skip to content

Commit 55f4694

Browse files
committed
feat: Add UTF-32 charset support and fix unused import
1 parent 732a604 commit 55f4694

2 files changed

Lines changed: 65 additions & 2 deletions

File tree

datafusion/core/src/execution/session_state.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ use crate::datasource::provider_as_source;
3030
use crate::execution::SessionStateDefaults;
3131
use crate::execution::context::{EmptySerializerRegistry, FunctionFactory, QueryPlanner};
3232
use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner};
33-
use arrow_schema::{DataType, FieldRef};
33+
use arrow_schema::FieldRef;
3434
use datafusion_catalog::MemoryCatalogProviderList;
3535
use datafusion_catalog::information_schema::{
3636
INFORMATION_SCHEMA, InformationSchemaProvider,

datafusion/spark/src/function/string/encode.rs

Lines changed: 64 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,8 +106,30 @@ fn encode_string(s: &str, charset: &str) -> Result<Vec<u8>> {
106106
}
107107
Ok(bytes)
108108
}
109+
"UTF-32BE" | "UTF32BE" => {
110+
let mut bytes = Vec::new();
111+
for c in s.chars() {
112+
bytes.extend_from_slice(&(c as u32).to_be_bytes());
113+
}
114+
Ok(bytes)
115+
}
116+
"UTF-32LE" | "UTF32LE" => {
117+
let mut bytes = Vec::new();
118+
for c in s.chars() {
119+
bytes.extend_from_slice(&(c as u32).to_le_bytes());
120+
}
121+
Ok(bytes)
122+
}
123+
"UTF-32" | "UTF32" => {
124+
// BOM (big-endian marker) followed by UTF-32BE encoded bytes
125+
let mut bytes = vec![0x00, 0x00, 0xFE, 0xFF];
126+
for c in s.chars() {
127+
bytes.extend_from_slice(&(c as u32).to_be_bytes());
128+
}
129+
Ok(bytes)
130+
}
109131
_ => exec_err!(
110-
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE",
132+
"Unsupported charset for encode: '{}'. Supported: US-ASCII, ISO-8859-1, UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE",
111133
charset
112134
),
113135
}
@@ -547,6 +569,47 @@ mod tests {
547569
);
548570
}
549571

572+
/// Encodes the two-character string "AB" with the "UTF-32BE" charset and
/// checks that each character becomes one 4-byte unit with the most
/// significant byte first, and that no byte-order mark precedes the data.
#[test]
573+
fn test_encode_utf32be() {
574+
// 'A' = U+0041 → 00 00 00 41, 'B' = U+0042 → 00 00 00 42
575+
let result =
576+
eval_encode_scalar(ScalarValue::Utf8(Some("AB".into())), "UTF-32BE").unwrap();
577+
assert_eq!(
578+
expect_binary_scalar(result),
579+
vec![0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x42]
580+
);
581+
}
582+
583+
/// Encodes the two-character string "AB" with the "UTF-32LE" charset and
/// checks that each character becomes one 4-byte unit with the least
/// significant byte first, and that no byte-order mark precedes the data.
#[test]
584+
fn test_encode_utf32le() {
585+
// 'A' = U+0041 → 41 00 00 00, 'B' = U+0042 → 42 00 00 00
586+
let result =
587+
eval_encode_scalar(ScalarValue::Utf8(Some("AB".into())), "UTF-32LE").unwrap();
588+
assert_eq!(
589+
expect_binary_scalar(result),
590+
vec![0x41, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00]
591+
);
592+
}
593+
594+
/// Encodes "A" with the unqualified "UTF-32" charset name and checks that the
/// output is the 4-byte marker 00 00 FE FF followed by the big-endian unit
/// for U+0041.
///
/// NOTE(review): Java's UTF-32 encoder (which Spark's `encode` is understood
/// to delegate to) writes big-endian output WITHOUT a byte-order mark, unlike
/// its UTF-16 encoder. If Spark compatibility is the goal, the expected bytes
/// here may need to be just 00 00 00 41 — TODO confirm against Spark output.
#[test]
595+
fn test_encode_utf32_with_bom() {
596+
// UTF-32 = BOM (0000FEFF) + UTF-32BE
597+
let result =
598+
eval_encode_scalar(ScalarValue::Utf8(Some("A".into())), "UTF-32").unwrap();
599+
assert_eq!(
600+
expect_binary_scalar(result),
601+
vec![0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41]
602+
);
603+
}
604+
605+
/// Encodes a single character above U+FFFF with the "UTF-32BE" charset and
/// checks that it is emitted as exactly one 4-byte big-endian unit (no
/// surrogate pair, no byte-order mark).
#[test]
606+
fn test_encode_emoji_utf32be() {
607+
// U+1F600 (😀) → 00 01 F6 00
608+
let result =
609+
eval_encode_scalar(ScalarValue::Utf8(Some("😀".into())), "UTF-32BE").unwrap();
610+
assert_eq!(expect_binary_scalar(result), vec![0x00, 0x01, 0xF6, 0x00]);
611+
}
612+
550613
/// Simple hex encoding for test assertions.
551614
fn hex_encode(bytes: &[u8]) -> String {
552615
bytes.iter().map(|b| format!("{b:02X}")).collect::<String>()

0 commit comments

Comments
 (0)