Skip to content

Commit 02c9283

Browse files
Add support for VARIANT (#317)
This PR adds handling for VARIANT in duckdb-python. duckdblabs/duckdb-internal#7460
2 parents 7abde9b + 6e7bc3f commit 02c9283

9 files changed

Lines changed: 250 additions & 0 deletions

File tree

_duckdb-stubs/_sqltypes.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ __all__: list[str] = [
2929
"UTINYINT",
3030
"UUID",
3131
"VARCHAR",
32+
"VARIANT",
3233
"DuckDBPyType",
3334
]
3435

@@ -74,3 +75,4 @@ USMALLINT: DuckDBPyType # value = USMALLINT
7475
UTINYINT: DuckDBPyType # value = UTINYINT
7576
UUID: DuckDBPyType # value = UUID
7677
VARCHAR: DuckDBPyType # value = VARCHAR
78+
VARIANT: DuckDBPyType # value = VARIANT

duckdb/experimental/spark/sql/type_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
UnsignedLongType,
3838
UnsignedShortType,
3939
UUIDType,
40+
VariantType,
4041
)
4142

4243
_sqltype_to_spark_class = {
@@ -74,6 +75,7 @@
7475
"float": FloatType,
7576
"double": DoubleType,
7677
"decimal": DecimalType,
78+
"variant": VariantType,
7779
}
7880

7981

duckdb/experimental/spark/sql/types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
"UnsignedIntegerType",
5858
"UnsignedLongType",
5959
"UnsignedShortType",
60+
"VariantType",
6061
]
6162

6263

@@ -187,6 +188,13 @@ def __init__(self) -> None: # noqa: D107
187188
super().__init__(DuckDBPyType("BOOLEAN"))
188189

189190

191+
class VariantType(AtomicType, metaclass=DataTypeSingleton):
    """Spark data type backed by DuckDB's semi-structured VARIANT type."""

    def __init__(self) -> None:  # noqa: D107
        # Build the DuckDB-side type first, then hand it to the AtomicType base.
        duckdb_variant = DuckDBPyType("VARIANT")
        super().__init__(duckdb_variant)
196+
197+
190198
class DateType(AtomicType, metaclass=DataTypeSingleton):
191199
"""Date (datetime.date) data type."""
192200

duckdb/sqltypes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
UTINYINT,
3030
UUID,
3131
VARCHAR,
32+
VARIANT,
3233
DuckDBPyType,
3334
)
3435

@@ -61,5 +62,6 @@
6162
"UTINYINT",
6263
"UUID",
6364
"VARCHAR",
65+
"VARIANT",
6466
"DuckDBPyType",
6567
]

src/duckdb_py/native/python_objects.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
#include "datetime.h" // Python datetime initialize #1
1515

16+
#include <duckdb/common/types/variant_value.hpp>
17+
#include <duckdb/function/scalar/variant_utils.hpp>
18+
1619
namespace duckdb {
1720

1821
PyDictionary::PyDictionary(py::object dict) {
@@ -445,6 +448,7 @@ static bool KeyIsHashable(const LogicalType &type) {
445448
case LogicalTypeId::LIST:
446449
case LogicalTypeId::ARRAY:
447450
case LogicalTypeId::MAP:
451+
case LogicalTypeId::VARIANT:
448452
return false;
449453
case LogicalTypeId::UNION: {
450454
idx_t count = UnionType::GetMemberCount(type);
@@ -700,6 +704,14 @@ py::object PythonObject::FromValue(const Value &val, const LogicalType &type,
700704
return import_cache.datetime.timedelta()(py::arg("days") = days,
701705
py::arg("microseconds") = interval_value.micros);
702706
}
707+
case LogicalTypeId::VARIANT: {
708+
Vector tmp(val);
709+
RecursiveUnifiedVectorFormat format;
710+
Vector::RecursiveToUnifiedFormat(tmp, 1, format);
711+
UnifiedVariantVectorData vector_data(format);
712+
auto variant_val = VariantUtils::ConvertVariantToValue(vector_data, 0, 0);
713+
return FromValue(variant_val, variant_val.type(), client_properties);
714+
}
703715

704716
default:
705717
throw NotImplementedException("Unsupported type: \"%s\"", type.ToString());

src/duckdb_py/numpy/array_wrapper.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
#include "duckdb_python/pyresult.hpp"
1212
#include "duckdb/common/types/uuid.hpp"
1313

14+
#include <duckdb/function/scalar/variant_utils.hpp>
15+
1416
namespace duckdb {
1517

1618
namespace duckdb_py_convert {
@@ -302,6 +304,19 @@ struct UnionConvert {
302304
}
303305
};
304306

307+
struct VariantConvert {
308+
static py::object ConvertValue(Vector &input, idx_t chunk_offset, NumpyAppendData &append_data) {
309+
auto &client_properties = append_data.client_properties;
310+
auto val = input.GetValue(chunk_offset);
311+
Vector tmp(val);
312+
RecursiveUnifiedVectorFormat format;
313+
Vector::RecursiveToUnifiedFormat(tmp, 1, format);
314+
UnifiedVariantVectorData vector_data(format);
315+
auto variant_val = VariantUtils::ConvertVariantToValue(vector_data, 0, 0);
316+
return PythonObject::FromValue(variant_val, variant_val.type(), client_properties);
317+
}
318+
};
319+
305320
struct MapConvert {
306321
static py::dict ConvertValue(Vector &input, idx_t chunk_offset, NumpyAppendData &append_data) {
307322
auto &client_properties = append_data.client_properties;
@@ -687,6 +702,9 @@ void ArrayWrapper::Append(idx_t current_offset, Vector &input, idx_t source_size
687702
case LogicalTypeId::STRUCT:
688703
may_have_null = ConvertNested<py::object, duckdb_py_convert::StructConvert>(append_data);
689704
break;
705+
case LogicalTypeId::VARIANT:
706+
may_have_null = ConvertNested<py::object, duckdb_py_convert::VariantConvert>(append_data);
707+
break;
690708
case LogicalTypeId::UUID:
691709
may_have_null = ConvertColumn<hugeint_t, PyObject *, duckdb_py_convert::UUIDConvert>(append_data);
692710
break;

src/duckdb_py/numpy/raw_array_wrapper.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ static idx_t GetNumpyTypeWidth(const LogicalType &type) {
6060
case LogicalTypeId::UNION:
6161
case LogicalTypeId::UUID:
6262
case LogicalTypeId::ARRAY:
63+
case LogicalTypeId::VARIANT:
6364
return sizeof(PyObject *);
6465
default:
6566
throw NotImplementedException("Unsupported type \"%s\" for DuckDB -> NumPy conversion", type.ToString());
@@ -122,6 +123,7 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) {
122123
case LogicalTypeId::UNION:
123124
case LogicalTypeId::UUID:
124125
case LogicalTypeId::ARRAY:
126+
case LogicalTypeId::VARIANT:
125127
return "object";
126128
case LogicalTypeId::ENUM: {
127129
auto size = EnumType::GetSize(type);

src/duckdb_py/typing/typing.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ static void DefineBaseTypes(py::handle &m) {
3737
m.attr("BLOB") = make_shared_ptr<DuckDBPyType>(LogicalType::BLOB);
3838
m.attr("BIT") = make_shared_ptr<DuckDBPyType>(LogicalType::BIT);
3939
m.attr("INTERVAL") = make_shared_ptr<DuckDBPyType>(LogicalType::INTERVAL);
40+
m.attr("VARIANT") = make_shared_ptr<DuckDBPyType>(LogicalType::VARIANT());
4041
}
4142

4243
void DuckDBPyTyping::Initialize(py::module_ &parent) {

tests/fast/test_variant.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import numpy as np
2+
import pytest
3+
4+
import duckdb
5+
6+
7+
class TestVariantFetchall:
    """Row-based fetching (fetchone/fetchall) of VARIANT values as native Python objects."""

    def test_integer(self):
        row = duckdb.sql("SELECT 42::VARIANT AS v").fetchone()
        assert row[0] == 42

    def test_string(self):
        row = duckdb.sql("SELECT 'hello'::VARIANT AS v").fetchone()
        assert row[0] == "hello"

    def test_boolean(self):
        row = duckdb.sql("SELECT true::VARIANT AS v").fetchone()
        assert row[0] is True

    def test_double(self):
        row = duckdb.sql("SELECT 3.14::DOUBLE::VARIANT AS v").fetchone()
        # Floating point: compare with an explicit absolute tolerance.
        assert row[0] == pytest.approx(3.14, abs=1e-10)

    def test_null(self):
        row = duckdb.sql("SELECT NULL::VARIANT AS v").fetchone()
        assert row[0] is None

    def test_list(self):
        row = duckdb.sql("SELECT [1, 2, 3]::VARIANT AS v").fetchone()
        assert row[0] == [1, 2, 3]

    def test_struct(self):
        row = duckdb.sql("SELECT {'a': 1, 'b': 2}::VARIANT AS v").fetchone()
        assert row[0] == {"a": 1, "b": 2}

    def test_nested_struct(self):
        row = duckdb.sql("SELECT {'x': {'y': 42}}::VARIANT AS v").fetchone()
        assert row[0] == {"x": {"y": 42}}

    def test_map(self):
        row = duckdb.sql("SELECT MAP {'key1': 'val1', 'key2': 'val2'}::VARIANT AS v").fetchone()
        # VARIANT converts maps to a list of key/value structs
        expected = [{"key": "key1", "value": "val1"}, {"key": "key2", "value": "val2"}]
        assert row[0] == expected

    def test_multiple_rows_mixed_types(self):
        rows = duckdb.sql("""
            SELECT * FROM (
                VALUES (42::VARIANT), ('hello'::VARIANT), (true::VARIANT), ([1,2]::VARIANT)
            ) AS t(v)
        """).fetchall()
        assert rows[0][0] == 42
        assert rows[1][0] == "hello"
        assert rows[2][0] is True
        assert rows[3][0] == [1, 2]

    def test_variant_from_table(self):
        # Use the connection as a context manager so it is closed even when an
        # assertion fails.
        with duckdb.connect() as con:
            con.execute("CREATE TABLE t (v VARIANT)")
            con.execute("INSERT INTO t VALUES (42::VARIANT), ('hello'::VARIANT)")
            rows = con.execute("SELECT * FROM t").fetchall()
        assert rows[0][0] == 42
        assert rows[1][0] == "hello"

    def test_variant_as_map_key(self):
        """The original repro that motivated VARIANT support."""
        row = duckdb.sql("""
            SELECT MAP {42::VARIANT: 'answer'} AS m
        """).fetchone()
        # MAP with VARIANT keys is returned as a struct with key/value arrays
        assert row[0] == {"key": [42], "value": ["answer"]}
74+
75+
76+
class TestVariantFetchNumpy:
    """fetchnumpy() coverage for columns of type VARIANT."""

    def test_single_row(self):
        columns = duckdb.sql("SELECT 42::VARIANT AS v").fetchnumpy()
        assert columns["v"][0] == 42

    def test_multiple_rows(self):
        """Fetch several rows so conversion runs at chunk offsets above zero.

        The original test notes this path was broken by earlier bugs ("Bug A/B").
        """
        query = """
            SELECT * FROM (
                VALUES (1::VARIANT), (2::VARIANT), (3::VARIANT)
            ) AS t(v)
        """
        columns = duckdb.sql(query).fetchnumpy()
        fetched = list(columns["v"])
        assert fetched == [1, 2, 3]

    def test_null_handling(self):
        query = """
            SELECT * FROM (
                VALUES (42::VARIANT), (NULL::VARIANT), (99::VARIANT)
            ) AS t(v)
        """
        column = duckdb.sql(query).fetchnumpy()["v"]
        assert column[0] == 42
        # The NULL row may surface either as a masked entry or as None.
        assert column[1] is np.ma.masked or column[1] is None
        assert column[2] == 99

    def test_mixed_types(self):
        query = """
            SELECT * FROM (
                VALUES (42::VARIANT), ('hello'::VARIANT), (true::VARIANT)
            ) AS t(v)
        """
        columns = duckdb.sql(query).fetchnumpy()
        fetched = list(columns["v"])
        assert fetched[0] == 42
        assert fetched[1] == "hello"
        assert fetched[2] is True
114+
115+
116+
class TestVariantFetchDF:
    """Tests for Pandas df() with VARIANT columns (goes through numpy)."""

    def test_basic(self):
        frame = duckdb.sql("SELECT 42::VARIANT AS v").df()
        assert frame["v"].iloc[0] == 42

    def test_multiple_types(self):
        frame = duckdb.sql("""
            SELECT * FROM (
                VALUES (42::VARIANT), ('hello'::VARIANT), (true::VARIANT)
            ) AS t(v)
        """).df()
        column = frame["v"]
        assert column.iloc[0] == 42
        assert column.iloc[1] == "hello"
        assert column.iloc[2] is True

    def test_null_handling(self):
        frame = duckdb.sql("""
            SELECT * FROM (
                VALUES (42::VARIANT), (NULL::VARIANT), (99::VARIANT)
            ) AS t(v)
        """).df()
        column = frame["v"]
        assert column.iloc[0] == 42
        # Actually check the NULL row: it must surface as a missing value.
        # isna() accepts whichever missing-value marker pandas ends up storing.
        assert column.isna().iloc[1]
        assert column.iloc[2] == 99
141+
142+
143+
class TestVariantArrow:
    """Arrow and Polars export of VARIANT; every test here is a strict xfail."""

    @pytest.mark.xfail(strict=True, reason="Arrow export for VARIANT not yet supported in DuckDB core")
    def test_to_arrow_table(self):
        relation = duckdb.sql("SELECT 42::VARIANT AS v")
        relation.arrow()

    @pytest.mark.xfail(strict=True, reason="Arrow export for VARIANT not yet supported in DuckDB core")
    def test_fetch_arrow_reader(self):
        relation = duckdb.sql("SELECT 42::VARIANT AS v")
        relation.fetch_arrow_reader()

    @pytest.mark.xfail(strict=True, reason="Polars uses Arrow, which doesn't support VARIANT yet")
    def test_polars(self):
        relation = duckdb.sql("SELECT 42::VARIANT AS v")
        relation.pl()
157+
158+
159+
class TestVariantIngestion:
    """Tests for Python → DuckDB VARIANT ingestion."""

    def test_insert_with_params(self):
        # Use the connection as a context manager so it is closed even when an
        # assertion fails.
        with duckdb.connect() as con:
            con.execute("CREATE TABLE t (v VARIANT)")
            # The Python int is bound as a parameter and cast to VARIANT in SQL.
            con.execute("INSERT INTO t VALUES ($1::VARIANT)", [42])
            row = con.execute("SELECT * FROM t").fetchone()
        assert row[0] == 42
168+
169+
170+
class TestVariantType:
    """Tests for VARIANT in the type system."""

    def test_type_from_string(self):
        variant = duckdb.type("VARIANT")
        assert variant.id == "variant"

    def test_variant_constant(self):
        from duckdb.sqltypes import VARIANT

        assert VARIANT is not None
        assert VARIANT.id == "variant"

    def test_children_raises(self):
        variant = duckdb.type("VARIANT")
        # Accessing .children on VARIANT is expected to raise a "not nested" error.
        with pytest.raises(duckdb.InvalidInputException, match="not nested"):
            _ = variant.children

    def test_sqltypes_variant(self):
        # Previously an exact duplicate of test_variant_constant; this now also
        # covers the public export list of duckdb.sqltypes.
        import duckdb.sqltypes as sqltypes

        assert "VARIANT" in sqltypes.__all__
        assert sqltypes.VARIANT.id == "variant"
192+
193+
194+
class TestVariantPySpark:
    """Mapping of DuckDB's VARIANT type onto the experimental Spark type classes."""

    def test_variant_converts_to_variant_type(self):
        from duckdb.experimental.spark.sql.type_utils import convert_type
        from duckdb.experimental.spark.sql.types import VariantType

        # Convert the DuckDB type and check which Spark class comes back.
        converted = convert_type(duckdb.type("VARIANT"))
        assert isinstance(converted, VariantType)

0 commit comments

Comments
 (0)