|
17 | 17 |
|
18 | 18 | use crate::utils::{make_scalar_function, utf8_to_int_type}; |
19 | 19 | use arrow::array::{ |
20 | | - Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray, |
21 | | - StringArrayType, |
| 20 | + Array, ArrayRef, ArrowPrimitiveType, AsArray, GenericStringArray, OffsetSizeTrait, |
| 21 | + PrimitiveArray, StringViewArray, |
22 | 22 | }; |
23 | 23 | use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; |
24 | 24 | use datafusion_common::Result; |
@@ -104,65 +104,98 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> { |
104 | 104 | match args[0].data_type() { |
105 | 105 | DataType::Utf8 => { |
106 | 106 | let string_array = args[0].as_string::<i32>(); |
107 | | - character_length_general::<Int32Type, _>(&string_array) |
| 107 | + character_length_offsets::<Int32Type, i32>(string_array) |
108 | 108 | } |
109 | 109 | DataType::LargeUtf8 => { |
110 | 110 | let string_array = args[0].as_string::<i64>(); |
111 | | - character_length_general::<Int64Type, _>(&string_array) |
| 111 | + character_length_offsets::<Int64Type, i64>(string_array) |
112 | 112 | } |
113 | 113 | DataType::Utf8View => { |
114 | 114 | let string_array = args[0].as_string_view(); |
115 | | - character_length_general::<Int32Type, _>(&string_array) |
| 115 | + character_length_string_view::<Int32Type>(string_array) |
116 | 116 | } |
117 | 117 | _ => unreachable!("CharacterLengthFunc"), |
118 | 118 | } |
119 | 119 | } |
120 | 120 |
|
121 | | -fn character_length_general<'a, T, V>(array: &V) -> Result<ArrayRef> |
| 121 | +/// Computes character lengths for offset-based string arrays (Utf8/LargeUtf8). |
| 122 | +/// When `array.is_ascii()` holds, each length is the difference of adjacent offsets, |
| 123 | +/// so values are not decoded (NOTE(review): `is_ascii()` itself presumably scans the data — confirm). |
| 124 | +fn character_length_offsets<T, O>(array: &GenericStringArray<O>) -> Result<ArrayRef> |
122 | 125 | where |
123 | 126 | T: ArrowPrimitiveType, |
124 | 127 | T::Native: OffsetSizeTrait, |
125 | | - V: StringArrayType<'a>, |
| 128 | + O: OffsetSizeTrait, |
126 | 129 | { |
127 | | - // String characters are variable length encoded in UTF-8, counting the |
128 | | - // number of chars requires expensive decoding, however checking if the |
129 | | - // string is ASCII only is relatively cheap. |
130 | | - // If strings are ASCII only, count bytes instead. |
131 | | - let is_array_ascii_only = array.is_ascii(); |
132 | 130 | let nulls = array.nulls().cloned(); |
133 | | - let array = { |
134 | | - if is_array_ascii_only { |
135 | | - let values: Vec<_> = (0..array.len()) |
136 | | - .map(|i| { |
137 | | - // Safety: we are iterating with array.len() so the index is always valid |
138 | | - let value = unsafe { array.value_unchecked(i) }; |
| 131 | + let offsets = array.offsets(); |
| 132 | +
| 133 | + if array.is_ascii() { |
| 134 | + // ASCII: byte length == char length, compute from offsets only |
| 135 | + let values: Vec<T::Native> = offsets |
| 136 | + .windows(2) |
| 137 | + .map(|w| T::Native::usize_as((w[1] - w[0]).as_usize())) |
| 138 | + .collect(); |
| 139 | + Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls))) |
| 140 | + } else { |
| 141 | + let values: Vec<T::Native> = (0..array.len()) |
| 142 | + .map(|i| { |
| 143 | + // SAFETY: i comes from 0..array.len(), so it is in bounds; null slots are not skipped here, the `nulls` buffer passed below carries validity |
| 144 | + let value = unsafe { array.value_unchecked(i) }; |
| 145 | + if value.is_empty() { |
| 146 | + T::default_value() |
| 147 | + } else if value.is_ascii() { |
139 | 148 | T::Native::usize_as(value.len()) |
140 | | - }) |
141 | | - .collect(); |
142 | | - PrimitiveArray::<T>::new(values.into(), nulls) |
143 | | - } else { |
144 | | - let values: Vec<_> = (0..array.len()) |
145 | | - .map(|i| { |
146 | | - // Safety: we are iterating with array.len() so the index is always valid |
147 | | - if array.is_null(i) { |
| 149 | + } else { |
| 150 | + T::Native::usize_as(value.chars().count()) |
| 151 | + } |
| 152 | + }) |
| 153 | + .collect(); |
| 154 | + Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls))) |
| 155 | + } |
| 156 | +} |
| 157 | + |
| 158 | +/// Computes character lengths for a StringViewArray. |
| 159 | +/// When `array.is_ascii()` holds, each length is taken from the low 32 bits of the view, |
| 160 | +/// so values are not decoded (NOTE(review): `is_ascii()` itself presumably scans the data — confirm). |
| 161 | +fn character_length_string_view<T>(array: &StringViewArray) -> Result<ArrayRef> |
| 162 | +where |
| 163 | + T: ArrowPrimitiveType, |
| 164 | + T::Native: OffsetSizeTrait, |
| 165 | +{ |
| 166 | + let nulls = array.nulls().cloned(); |
| 167 | + let views = array.views(); |
| 168 | +
| 169 | + if array.is_ascii() { |
| 170 | + // ASCII: byte length == char length; the `as u32` cast keeps the low 32 bits of the view, used here as the length |
| 171 | + let values: Vec<T::Native> = views |
| 172 | + .iter() |
| 173 | + .map(|view| { |
| 174 | + let len = (*view as u32) as usize; |
| 175 | + T::Native::usize_as(len) |
| 176 | + }) |
| 177 | + .collect(); |
| 178 | + Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls))) |
| 179 | + } else { |
| 180 | + let values: Vec<T::Native> = (0..array.len()) |
| 181 | + .map(|i| { |
| 182 | + if array.is_null(i) { |
| 183 | + T::default_value() |
| 184 | + } else { |
| 185 | + // SAFETY: i comes from 0..array.len(), so it is in bounds |
| 186 | + let value = unsafe { array.value_unchecked(i) }; |
| 187 | + if value.is_empty() { |
148 | 188 | T::default_value() |
| 189 | + } else if value.is_ascii() { |
| 190 | + T::Native::usize_as(value.len()) |
149 | 191 | } else { |
150 | | - let value = unsafe { array.value_unchecked(i) }; |
151 | | - if value.is_empty() { |
152 | | - T::default_value() |
153 | | - } else if value.is_ascii() { |
154 | | - T::Native::usize_as(value.len()) |
155 | | - } else { |
156 | | - T::Native::usize_as(value.chars().count()) |
157 | | - } |
| 192 | + T::Native::usize_as(value.chars().count()) |
158 | 193 | } |
159 | | - }) |
160 | | - .collect(); |
161 | | - PrimitiveArray::<T>::new(values.into(), nulls) |
162 | | - } |
163 | | - }; |
164 | | - |
165 | | - Ok(Arc::new(array)) |
| 194 | + } |
| 195 | + }) |
| 196 | + .collect(); |
| 197 | + Ok(Arc::new(PrimitiveArray::<T>::new(values.into(), nulls))) |
| 198 | + } |
166 | 199 | } |
167 | 200 |
|
168 | 201 | #[cfg(test)] |
|
0 commit comments