Skip to content

Commit 8a45d02

Browse files
Jefffreydqkqdalamb
authored
feat: support ListView and LargeListView in ScalarValue (#21669)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #18886 - Previous iteration: #18884 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> More support for listview types in the codebase ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> Added `ListView` and `LargeListView` to `ScalarValue` with all accompanying changes Support `ListView` and `LargeListView` in proto, both for the arrow datatype & the newly introduced scalarvalue variants. ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Yes, added tests ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> No <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> --------- Co-authored-by: Khanh Duong <dqkqdlot@gmail.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 64619a6 commit 8a45d02

10 files changed

Lines changed: 980 additions & 360 deletions

File tree

datafusion/common/src/scalar/mod.rs

Lines changed: 532 additions & 93 deletions
Large diffs are not rendered by default.

datafusion/common/src/utils/mod.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ use arrow::array::{
3030
Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray, OffsetSizeTrait,
3131
cast::AsArray,
3232
};
33-
use arrow::buffer::OffsetBuffer;
33+
use arrow::array::{LargeListViewArray, ListViewArray};
34+
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
3435
use arrow::compute::{SortColumn, SortOptions, partition};
3536
use arrow::datatypes::{DataType, Field, SchemaRef};
3637
#[cfg(feature = "sql")]
@@ -480,6 +481,34 @@ impl SingleRowListArrayBuilder {
480481
ScalarValue::FixedSizeList(Arc::new(self.build_fixed_size_list_array(list_size)))
481482
}
482483

484+
/// Build a single element [`ListViewArray`]
485+
pub fn build_list_view_array(self) -> ListViewArray {
486+
let (field, arr) = self.into_field_and_arr();
487+
let offsets = ScalarBuffer::from(vec![0]);
488+
let sizes = ScalarBuffer::from(vec![i32::try_from(arr.len()).expect(
489+
"Trying to construct a ListView where element length exceeds i32::MAX",
490+
)]);
491+
ListViewArray::new(field, offsets, sizes, arr, None)
492+
}
493+
494+
/// Build a single element [`ListViewArray`] and wrap as [`ScalarValue::ListView`]
495+
pub fn build_list_view_scalar(self) -> ScalarValue {
496+
ScalarValue::ListView(Arc::new(self.build_list_view_array()))
497+
}
498+
499+
/// Build a single element [`LargeListViewArray`]
500+
pub fn build_large_list_view_array(self) -> LargeListViewArray {
501+
let (field, arr) = self.into_field_and_arr();
502+
let offsets = ScalarBuffer::from(vec![0]);
503+
let sizes = ScalarBuffer::from(vec![arr.len() as i64]);
504+
LargeListViewArray::new(field, offsets, sizes, arr, None)
505+
}
506+
507+
/// Build a single element [`LargeListViewArray`] and wrap as [`ScalarValue::LargeListView`]
508+
pub fn build_large_list_view_scalar(self) -> ScalarValue {
509+
ScalarValue::LargeListView(Arc::new(self.build_large_list_view_array()))
510+
}
511+
483512
/// Helper function: convert this builder into a tuple of field and array
484513
fn into_field_and_arr(self) -> (Arc<Field>, ArrayRef) {
485514
let Self {

datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ message Union{
199199
repeated int32 type_ids = 3;
200200
}
201201

202-
// Used for List/FixedSizeList/LargeList/Struct/Map
202+
// Used for List/FixedSizeList/LargeList/ListView/LargeListView/Struct/Map
203203
message ScalarNestedValue {
204204
message Dictionary {
205205
bytes ipc_message = 1;
@@ -306,6 +306,8 @@ message ScalarValue{
306306
ScalarNestedValue large_list_value = 16;
307307
ScalarNestedValue list_value = 17;
308308
ScalarNestedValue fixed_size_list_value = 18;
309+
ScalarNestedValue list_view_value = 46;
310+
ScalarNestedValue large_list_view_value = 47;
309311
ScalarNestedValue struct_value = 32;
310312
ScalarNestedValue map_value = 41;
311313

@@ -398,6 +400,8 @@ message ArrowType{
398400
List LIST = 25;
399401
List LARGE_LIST = 26;
400402
FixedSizeList FIXED_SIZE_LIST = 27;
403+
List LIST_VIEW = 43;
404+
List LARGE_LIST_VIEW = 44;
401405
Struct STRUCT = 28;
402406
Union UNION = 29;
403407
Dictionary DICTIONARY = 30;

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,16 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
296296
let list_size = list.list_size;
297297
DataType::FixedSizeList(Arc::new(list_type), list_size)
298298
}
299+
arrow_type::ArrowTypeEnum::ListView(list) => {
300+
let list_type =
301+
list.as_ref().field_type.as_deref().required("field_type")?;
302+
DataType::ListView(Arc::new(list_type))
303+
}
304+
arrow_type::ArrowTypeEnum::LargeListView(list) => {
305+
let list_type =
306+
list.as_ref().field_type.as_deref().required("field_type")?;
307+
DataType::LargeListView(Arc::new(list_type))
308+
}
299309
arrow_type::ArrowTypeEnum::Struct(strct) => DataType::Struct(
300310
parse_proto_fields_to_fields(&strct.sub_field_types)?.into(),
301311
),
@@ -405,6 +415,8 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
405415
Value::ListValue(v)
406416
| Value::FixedSizeListValue(v)
407417
| Value::LargeListValue(v)
418+
| Value::ListViewValue(v)
419+
| Value::LargeListViewValue(v)
408420
| Value::StructValue(v)
409421
| Value::MapValue(v) => {
410422
let protobuf::ScalarNestedValue {
@@ -517,6 +529,12 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
517529
Value::FixedSizeListValue(_) => {
518530
Self::FixedSizeList(arr.as_fixed_size_list().to_owned().into())
519531
}
532+
Value::ListViewValue(_) => {
533+
Self::ListView(arr.as_list_view::<i32>().to_owned().into())
534+
}
535+
Value::LargeListViewValue(_) => {
536+
Self::LargeListView(arr.as_list_view::<i64>().to_owned().into())
537+
}
520538
Value::StructValue(_) => {
521539
Self::Struct(arr.as_struct().to_owned().into())
522540
}

datafusion/proto-common/src/generated/pbjson.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,12 @@ impl serde::Serialize for ArrowType {
264264
arrow_type::ArrowTypeEnum::FixedSizeList(v) => {
265265
struct_ser.serialize_field("FIXEDSIZELIST", v)?;
266266
}
267+
arrow_type::ArrowTypeEnum::ListView(v) => {
268+
struct_ser.serialize_field("LISTVIEW", v)?;
269+
}
270+
arrow_type::ArrowTypeEnum::LargeListView(v) => {
271+
struct_ser.serialize_field("LARGELISTVIEW", v)?;
272+
}
267273
arrow_type::ArrowTypeEnum::Struct(v) => {
268274
struct_ser.serialize_field("STRUCT", v)?;
269275
}
@@ -332,6 +338,10 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
332338
"LARGELIST",
333339
"FIXED_SIZE_LIST",
334340
"FIXEDSIZELIST",
341+
"LIST_VIEW",
342+
"LISTVIEW",
343+
"LARGE_LIST_VIEW",
344+
"LARGELISTVIEW",
335345
"STRUCT",
336346
"UNION",
337347
"DICTIONARY",
@@ -376,6 +386,8 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
376386
List,
377387
LargeList,
378388
FixedSizeList,
389+
ListView,
390+
LargeListView,
379391
Struct,
380392
Union,
381393
Dictionary,
@@ -436,6 +448,8 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
436448
"LIST" => Ok(GeneratedField::List),
437449
"LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList),
438450
"FIXEDSIZELIST" | "FIXED_SIZE_LIST" => Ok(GeneratedField::FixedSizeList),
451+
"LISTVIEW" | "LIST_VIEW" => Ok(GeneratedField::ListView),
452+
"LARGELISTVIEW" | "LARGE_LIST_VIEW" => Ok(GeneratedField::LargeListView),
439453
"STRUCT" => Ok(GeneratedField::Struct),
440454
"UNION" => Ok(GeneratedField::Union),
441455
"DICTIONARY" => Ok(GeneratedField::Dictionary),
@@ -694,6 +708,20 @@ impl<'de> serde::Deserialize<'de> for ArrowType {
694708
return Err(serde::de::Error::duplicate_field("FIXEDSIZELIST"));
695709
}
696710
arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::FixedSizeList)
711+
;
712+
}
713+
GeneratedField::ListView => {
714+
if arrow_type_enum__.is_some() {
715+
return Err(serde::de::Error::duplicate_field("LISTVIEW"));
716+
}
717+
arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::ListView)
718+
;
719+
}
720+
GeneratedField::LargeListView => {
721+
if arrow_type_enum__.is_some() {
722+
return Err(serde::de::Error::duplicate_field("LARGELISTVIEW"));
723+
}
724+
arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::LargeListView)
697725
;
698726
}
699727
GeneratedField::Struct => {
@@ -7978,6 +8006,12 @@ impl serde::Serialize for ScalarValue {
79788006
scalar_value::Value::FixedSizeListValue(v) => {
79798007
struct_ser.serialize_field("fixedSizeListValue", v)?;
79808008
}
8009+
scalar_value::Value::ListViewValue(v) => {
8010+
struct_ser.serialize_field("listViewValue", v)?;
8011+
}
8012+
scalar_value::Value::LargeListViewValue(v) => {
8013+
struct_ser.serialize_field("largeListViewValue", v)?;
8014+
}
79818015
scalar_value::Value::StructValue(v) => {
79828016
struct_ser.serialize_field("structValue", v)?;
79838017
}
@@ -8115,6 +8149,10 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
81158149
"listValue",
81168150
"fixed_size_list_value",
81178151
"fixedSizeListValue",
8152+
"list_view_value",
8153+
"listViewValue",
8154+
"large_list_view_value",
8155+
"largeListViewValue",
81188156
"struct_value",
81198157
"structValue",
81208158
"map_value",
@@ -8185,6 +8223,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
81858223
LargeListValue,
81868224
ListValue,
81878225
FixedSizeListValue,
8226+
ListViewValue,
8227+
LargeListViewValue,
81888228
StructValue,
81898229
MapValue,
81908230
Decimal32Value,
@@ -8249,6 +8289,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
82498289
"largeListValue" | "large_list_value" => Ok(GeneratedField::LargeListValue),
82508290
"listValue" | "list_value" => Ok(GeneratedField::ListValue),
82518291
"fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue),
8292+
"listViewValue" | "list_view_value" => Ok(GeneratedField::ListViewValue),
8293+
"largeListViewValue" | "large_list_view_value" => Ok(GeneratedField::LargeListViewValue),
82528294
"structValue" | "struct_value" => Ok(GeneratedField::StructValue),
82538295
"mapValue" | "map_value" => Ok(GeneratedField::MapValue),
82548296
"decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value),
@@ -8417,6 +8459,20 @@ impl<'de> serde::Deserialize<'de> for ScalarValue {
84178459
return Err(serde::de::Error::duplicate_field("fixedSizeListValue"));
84188460
}
84198461
value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::FixedSizeListValue)
8462+
;
8463+
}
8464+
GeneratedField::ListViewValue => {
8465+
if value__.is_some() {
8466+
return Err(serde::de::Error::duplicate_field("listViewValue"));
8467+
}
8468+
value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::ListViewValue)
8469+
;
8470+
}
8471+
GeneratedField::LargeListViewValue => {
8472+
if value__.is_some() {
8473+
return Err(serde::de::Error::duplicate_field("largeListViewValue"));
8474+
}
8475+
value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::LargeListViewValue)
84208476
;
84218477
}
84228478
GeneratedField::StructValue => {

datafusion/proto-common/src/generated/prost.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ pub struct Union {
191191
#[prost(int32, repeated, tag = "3")]
192192
pub type_ids: ::prost::alloc::vec::Vec<i32>,
193193
}
194-
/// Used for List/FixedSizeList/LargeList/Struct/Map
194+
/// Used for List/FixedSizeList/LargeList/ListView/LargeListView/Struct/Map
195195
#[derive(Clone, PartialEq, ::prost::Message)]
196196
pub struct ScalarNestedValue {
197197
#[prost(bytes = "vec", tag = "1")]
@@ -327,7 +327,7 @@ pub struct ScalarFixedSizeBinary {
327327
pub struct ScalarValue {
328328
#[prost(
329329
oneof = "scalar_value::Value",
330-
tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42, 45"
330+
tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 46, 47, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42, 45"
331331
)]
332332
pub value: ::core::option::Option<scalar_value::Value>,
333333
}
@@ -378,6 +378,10 @@ pub mod scalar_value {
378378
ListValue(super::ScalarNestedValue),
379379
#[prost(message, tag = "18")]
380380
FixedSizeListValue(super::ScalarNestedValue),
381+
#[prost(message, tag = "46")]
382+
ListViewValue(super::ScalarNestedValue),
383+
#[prost(message, tag = "47")]
384+
LargeListViewValue(super::ScalarNestedValue),
381385
#[prost(message, tag = "32")]
382386
StructValue(super::ScalarNestedValue),
383387
#[prost(message, tag = "41")]
@@ -467,7 +471,7 @@ pub struct Decimal256 {
467471
pub struct ArrowType {
468472
#[prost(
469473
oneof = "arrow_type::ArrowTypeEnum",
470-
tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33, 42"
474+
tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 43, 44, 28, 29, 30, 33, 42"
471475
)]
472476
pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>,
473477
}
@@ -548,6 +552,10 @@ pub mod arrow_type {
548552
LargeList(::prost::alloc::boxed::Box<super::List>),
549553
#[prost(message, tag = "27")]
550554
FixedSizeList(::prost::alloc::boxed::Box<super::FixedSizeList>),
555+
#[prost(message, tag = "43")]
556+
ListView(::prost::alloc::boxed::Box<super::List>),
557+
#[prost(message, tag = "44")]
558+
LargeListView(::prost::alloc::boxed::Box<super::List>),
551559
#[prost(message, tag = "28")]
552560
Struct(super::Struct),
553561
#[prost(message, tag = "29")]

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,14 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum {
171171
DataType::LargeList(item_type) => Self::LargeList(Box::new(protobuf::List {
172172
field_type: Some(Box::new(item_type.as_ref().try_into()?)),
173173
})),
174+
DataType::ListView(item_type) => Self::ListView(Box::new(protobuf::List {
175+
field_type: Some(Box::new(item_type.as_ref().try_into()?)),
176+
})),
177+
DataType::LargeListView(item_type) => {
178+
Self::LargeListView(Box::new(protobuf::List {
179+
field_type: Some(Box::new(item_type.as_ref().try_into()?)),
180+
}))
181+
}
174182
DataType::Struct(struct_fields) => Self::Struct(protobuf::Struct {
175183
sub_field_types: convert_arc_fields_to_proto_fields(struct_fields)?,
176184
}),
@@ -227,11 +235,6 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum {
227235
values_field: Some(Box::new(values_field.as_ref().try_into()?)),
228236
}))
229237
}
230-
DataType::ListView(_) | DataType::LargeListView(_) => {
231-
return Err(Error::General(format!(
232-
"Proto serialization error: {val} not yet supported"
233-
)));
234-
}
235238
};
236239

237240
Ok(res)
@@ -383,6 +386,12 @@ impl TryFrom<&ScalarValue> for protobuf::ScalarValue {
383386
ScalarValue::FixedSizeList(arr) => {
384387
encode_scalar_nested_value(arr.to_owned() as ArrayRef, val)
385388
}
389+
ScalarValue::ListView(arr) => {
390+
encode_scalar_nested_value(arr.to_owned() as ArrayRef, val)
391+
}
392+
ScalarValue::LargeListView(arr) => {
393+
encode_scalar_nested_value(arr.to_owned() as ArrayRef, val)
394+
}
386395
ScalarValue::Struct(arr) => {
387396
encode_scalar_nested_value(arr.to_owned() as ArrayRef, val)
388397
}
@@ -1042,8 +1051,8 @@ fn create_proto_scalar<I, T: FnOnce(&I) -> protobuf::scalar_value::Value>(
10421051
Ok(protobuf::ScalarValue { value: Some(value) })
10431052
}
10441053

1045-
// Nested ScalarValue types (List / FixedSizeList / LargeList / Struct / Map) are serialized using
1046-
// Arrow IPC messages as a single column RecordBatch
1054+
// Nested ScalarValue types (List / FixedSizeList / LargeList / ListView / LargeListView / Struct / Map)
1055+
// are serialized using Arrow IPC messages as a single column RecordBatch
10471056
fn encode_scalar_nested_value(
10481057
arr: ArrayRef,
10491058
val: &ScalarValue,
@@ -1105,6 +1114,16 @@ fn encode_scalar_nested_value(
11051114
scalar_list_value,
11061115
)),
11071116
}),
1117+
ScalarValue::ListView(_) => Ok(protobuf::ScalarValue {
1118+
value: Some(protobuf::scalar_value::Value::ListViewValue(
1119+
scalar_list_value,
1120+
)),
1121+
}),
1122+
ScalarValue::LargeListView(_) => Ok(protobuf::ScalarValue {
1123+
value: Some(protobuf::scalar_value::Value::LargeListViewValue(
1124+
scalar_list_value,
1125+
)),
1126+
}),
11081127
ScalarValue::Struct(_) => Ok(protobuf::ScalarValue {
11091128
value: Some(protobuf::scalar_value::Value::StructValue(
11101129
scalar_list_value,

0 commit comments

Comments
 (0)