Skip to content

Commit f1711b5

Browse files
Support for Pandas 3.0.0 (#277)
Related #37 Two small code fixes: * Add support for native `StringDType` * Fixes scope bug in lazy attribute reading in ConvertDateTimeTypes Support for `timedelta64[s|ms|us|ns]`: * Where Pandas used to only support timedelta64[ns], Pandas 3 now infers the resolution. We added support for reading all of the new types. * We now output timedelta64[us] whenever timedeltas are read from DuckDB, since that is DuckDB's own resolution. **This is a breaking change.** Pyproject.toml updates: * env markers to allow higher versions of dependencies (pandas, pyarrow and adbc-driver-manager) Lots of changes to tests: * Removes the ArrowPandas and NumpyPandas helpers. We will now test using whatever default backend for almost all tests, which makes most sense for regular values. * We do still test arrow dtypes, but we now do so explicitly. * Support both `zoneinfo.ZoneInfo` objects (3.0.0 and up) and `tzdata` (before) Also see: - [Migration Guide](https://pandas.pydata.org/docs/dev/user_guide/migration-3-strings.html#string-migration-guide) - [Changelog](https://pandas.pydata.org/docs/dev/whatsnew/v3.0.0.html)
2 parents e32ed3e + 8361d73 commit f1711b5

50 files changed

Lines changed: 830 additions & 1003 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

pyproject.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,8 @@ stubdeps = [ # dependencies used for typehints in the stubs
234234
"typing-extensions",
235235
]
236236
test = [ # dependencies used for running tests
237-
"adbc-driver-manager; sys_platform != 'win32' or platform_machine != 'ARM64'",
237+
"adbc-driver-manager>=1.10.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
238+
"adbc-driver-manager>=1.7.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
238239
"pytest",
239240
"pytest-reraise",
240241
"pytest-timeout",
@@ -252,8 +253,10 @@ test = [ # dependencies used for running tests
252253
"requests",
253254
"urllib3",
254255
"fsspec>=2022.11.0; sys_platform != 'win32' or platform_machine != 'ARM64'",
255-
"pandas>=2.0.0",
256-
"pyarrow>=18.0.0; sys_platform != 'win32' or platform_machine != 'ARM64'",
256+
"pandas>=3.0.0; python_version > '3.10'",
257+
"pandas<3.0.0; python_version < '3.11'",
258+
"pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
259+
"pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
257260
"torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' ) and ( sys_platform != 'win32' or platform_machine != 'ARM64' or python_version > '3.11' )",
258261
"tensorflow==2.14.0; sys_platform == 'darwin' and python_version < '3.12'",
259262
"tensorflow-cpu>=2.14.0; sys_platform == 'linux' and platform_machine != 'aarch64' and python_version < '3.12'",

src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,28 @@ namespace duckdb {
1818
// Pandas Specific Types (e.g., categorical, datetime_tz,...)
1919
enum class NumpyNullableType : uint8_t {
2020
//! NumPy dtypes
21-
BOOL, //! bool_, bool8
22-
INT_8, //! byte, int8
23-
UINT_8, //! ubyte, uint8
24-
INT_16, //! int16, short
25-
UINT_16, //! uint16, ushort
26-
INT_32, //! int32, intc
27-
UINT_32, //! uint32, uintc,
28-
INT_64, //! int64, int0, int_, intp, matrix
29-
UINT_64, //! uint64, uint, uint0, uintp
30-
FLOAT_16, //! float16, half
31-
FLOAT_32, //! float32, single
32-
FLOAT_64, //! float64, float_, double
33-
OBJECT, //! object
34-
UNICODE, //! <U1, unicode_, str_, str0
35-
DATETIME_S, //! datetime64[s], <M8[s]
36-
DATETIME_MS, //! datetime64[ms], <M8[ms]
37-
DATETIME_NS, //! datetime64[ns], <M8[ns]
38-
DATETIME_US, //! datetime64[us], <M8[us]
39-
TIMEDELTA, //! timedelta64[D], timedelta64
21+
BOOL, //! bool_, bool8
22+
INT_8, //! byte, int8
23+
UINT_8, //! ubyte, uint8
24+
INT_16, //! int16, short
25+
UINT_16, //! uint16, ushort
26+
INT_32, //! int32, intc
27+
UINT_32, //! uint32, uintc,
28+
INT_64, //! int64, int0, int_, intp, matrix
29+
UINT_64, //! uint64, uint, uint0, uintp
30+
FLOAT_16, //! float16, half
31+
FLOAT_32, //! float32, single
32+
FLOAT_64, //! float64, float_, double
33+
OBJECT, //! object
34+
UNICODE, //! <U1, unicode_, str_, str0
35+
DATETIME_S, //! datetime64[s], <M8[s]
36+
DATETIME_MS, //! datetime64[ms], <M8[ms]
37+
DATETIME_NS, //! datetime64[ns], <M8[ns]
38+
DATETIME_US, //! datetime64[us], <M8[us]
39+
TIMEDELTA_NS, //! timedelta64[ns]
40+
TIMEDELTA_US, //! timedelta64[us]
41+
TIMEDELTA_MS, //! timedelta64[ms]
42+
TIMEDELTA_S, //! timedelta64[s]
4043

4144
//! ------------------------------------------------------------
4245
//! Extension Types

src/duckdb_py/numpy/array_wrapper.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ struct IntervalConvert {
112112
template <class DUCKDB_T, class NUMPY_T>
113113
static int64_t ConvertValue(interval_t val, NumpyAppendData &append_data) {
114114
(void)append_data;
115-
return Interval::GetNanoseconds(val);
115+
return Interval::GetMicro(val);
116116
}
117117

118118
template <class NUMPY_T, bool PANDAS>

src/duckdb_py/numpy/numpy_scan.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,10 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset,
302302
}
303303
break;
304304
}
305-
case NumpyNullableType::TIMEDELTA: {
305+
case NumpyNullableType::TIMEDELTA_NS:
306+
case NumpyNullableType::TIMEDELTA_US:
307+
case NumpyNullableType::TIMEDELTA_MS:
308+
case NumpyNullableType::TIMEDELTA_S: {
306309
auto src_ptr = reinterpret_cast<const int64_t *>(array.data());
307310
auto tgt_ptr = FlatVector::GetData<interval_t>(out);
308311
auto &mask = FlatVector::Validity(out);
@@ -314,7 +317,25 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset,
314317
mask.SetInvalid(row);
315318
continue;
316319
}
317-
int64_t micro = src_ptr[source_idx] / 1000;
320+
321+
int64_t micro;
322+
switch (bind_data.numpy_type.type) {
323+
case NumpyNullableType::TIMEDELTA_NS:
324+
micro = src_ptr[source_idx] / 1000; // ns -> us
325+
break;
326+
case NumpyNullableType::TIMEDELTA_US:
327+
micro = src_ptr[source_idx]; // already us
328+
break;
329+
case NumpyNullableType::TIMEDELTA_MS:
330+
micro = src_ptr[source_idx] * 1000; // ms -> us
331+
break;
332+
case NumpyNullableType::TIMEDELTA_S:
333+
micro = src_ptr[source_idx] * 1000000; // s -> us
334+
break;
335+
default:
336+
throw InternalException("Unexpected timedelta type");
337+
}
338+
318339
int64_t days = micro / Interval::MICROS_PER_DAY;
319340
micro = micro % Interval::MICROS_PER_DAY;
320341
int64_t months = days / Interval::DAYS_PER_MONTH;

src/duckdb_py/numpy/raw_array_wrapper.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) {
108108
case LogicalTypeId::DATE:
109109
return "datetime64[us]";
110110
case LogicalTypeId::INTERVAL:
111-
return "timedelta64[ns]";
111+
return "timedelta64[us]";
112112
case LogicalTypeId::TIME:
113113
case LogicalTypeId::TIME_TZ:
114114
case LogicalTypeId::VARCHAR:

src/duckdb_py/numpy/type.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,23 @@ static NumpyNullableType ConvertNumpyTypeInternal(const string &col_type_str) {
5858
if (col_type_str == "string") {
5959
return NumpyNullableType::STRING;
6060
}
61+
if (col_type_str == "str") {
62+
return NumpyNullableType::STRING;
63+
}
6164
if (col_type_str == "object") {
6265
return NumpyNullableType::OBJECT;
6366
}
6467
if (col_type_str == "timedelta64[ns]") {
65-
return NumpyNullableType::TIMEDELTA;
68+
return NumpyNullableType::TIMEDELTA_NS;
69+
}
70+
if (col_type_str == "timedelta64[us]") {
71+
return NumpyNullableType::TIMEDELTA_US;
72+
}
73+
if (col_type_str == "timedelta64[ms]") {
74+
return NumpyNullableType::TIMEDELTA_MS;
75+
}
76+
if (col_type_str == "timedelta64[s]") {
77+
return NumpyNullableType::TIMEDELTA_S;
6678
}
6779
// We use 'StartsWith' because it might have ', tz' at the end, indicating timezone
6880
if (StringUtil::StartsWith(col_type_str, "datetime64[ns")) {
@@ -140,7 +152,10 @@ LogicalType NumpyToLogicalType(const NumpyType &col_type) {
140152
return LogicalType::VARCHAR;
141153
case NumpyNullableType::OBJECT:
142154
return LogicalType::VARCHAR;
143-
case NumpyNullableType::TIMEDELTA:
155+
case NumpyNullableType::TIMEDELTA_NS:
156+
case NumpyNullableType::TIMEDELTA_US:
157+
case NumpyNullableType::TIMEDELTA_MS:
158+
case NumpyNullableType::TIMEDELTA_S:
144159
return LogicalType::INTERVAL;
145160
case NumpyNullableType::DATETIME_MS: {
146161
if (col_type.has_timezone) {

src/duckdb_py/pyresult.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ void DuckDBPyResult::ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_obje
304304
// We need to create the column anew because the exact dt changed to a new timezone
305305
ReplaceDFColumn(df, names[i].c_str(), i, new_value);
306306
} else if (date_as_object && result->types[i] == LogicalType::DATE) {
307-
auto new_value = df[names[i].c_str()].attr("dt").attr("date");
307+
py::object new_value = df[names[i].c_str()].attr("dt").attr("date");
308308
ReplaceDFColumn(df, names[i].c_str(), i, new_value);
309309
}
310310
}

tests/conftest.py

Lines changed: 21 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import warnings
33
from importlib import import_module
44
from pathlib import Path
5-
from typing import Any, Union
5+
from typing import Union
66

77
import pytest
88

@@ -19,13 +19,27 @@
1919
pandas = None
2020
pyarrow_dtype = None
2121

22-
# Check if pandas has arrow dtypes enabled
23-
try:
24-
from pandas.compat import pa_version_under7p0
2522

26-
pyarrow_dtypes_enabled = not pa_version_under7p0
27-
except ImportError:
28-
pyarrow_dtypes_enabled = False
23+
# Version-aware helpers for Pandas 2.x vs 3.0 compatibility
24+
def _get_pandas_ge_3():
25+
if pandas is None:
26+
return False
27+
from packaging.version import Version
28+
29+
return Version(pandas.__version__) >= Version("3.0.0")
30+
31+
32+
PANDAS_GE_3 = _get_pandas_ge_3()
33+
34+
35+
def is_string_dtype(dtype):
36+
"""Check if a dtype is a string dtype (works across Pandas 2.x and 3.0).
37+
38+
Uses pd.api.types.is_string_dtype() which handles:
39+
- Pandas 2.x: object dtype for strings
40+
- Pandas 3.0+: str (StringDtype) for strings
41+
"""
42+
return pandas.api.types.is_string_dtype(dtype)
2943

3044

3145
def import_pandas():
@@ -113,78 +127,6 @@ def pandas_supports_arrow_backend():
113127
return pandas_2_or_higher()
114128

115129

116-
def numpy_pandas_df(*args, **kwargs):
117-
return import_pandas().DataFrame(*args, **kwargs)
118-
119-
120-
def arrow_pandas_df(*args, **kwargs):
121-
df = numpy_pandas_df(*args, **kwargs)
122-
return df.convert_dtypes(dtype_backend="pyarrow")
123-
124-
125-
class NumpyPandas:
126-
def __init__(self) -> None:
127-
self.backend = "numpy_nullable"
128-
self.DataFrame = numpy_pandas_df
129-
self.pandas = import_pandas()
130-
131-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
132-
return getattr(self.pandas, name)
133-
134-
135-
def convert_arrow_to_numpy_backend(df):
136-
names = df.columns
137-
df_content = {}
138-
for name in names:
139-
df_content[name] = df[name].array.__arrow_array__()
140-
# This should convert the pyarrow chunked arrays into numpy arrays
141-
return import_pandas().DataFrame(df_content)
142-
143-
144-
def convert_to_numpy(df):
145-
if (
146-
pyarrow_dtypes_enabled
147-
and pyarrow_dtype is not None
148-
and any(True for x in df.dtypes if isinstance(x, pyarrow_dtype))
149-
):
150-
return convert_arrow_to_numpy_backend(df)
151-
return df
152-
153-
154-
def convert_and_equal(df1, df2, **kwargs):
155-
df1 = convert_to_numpy(df1)
156-
df2 = convert_to_numpy(df2)
157-
import_pandas().testing.assert_frame_equal(df1, df2, **kwargs)
158-
159-
160-
class ArrowMockTesting:
161-
def __init__(self) -> None:
162-
self.testing = import_pandas().testing
163-
self.assert_frame_equal = convert_and_equal
164-
165-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
166-
return getattr(self.testing, name)
167-
168-
169-
# This converts dataframes constructed with 'DataFrame(...)' to pyarrow backed dataframes
170-
# Assert equal does the opposite, turning all pyarrow backed dataframes into numpy backed ones
171-
# this is done because we don't produce pyarrow backed dataframes yet
172-
class ArrowPandas:
173-
def __init__(self) -> None:
174-
self.pandas = import_pandas()
175-
if pandas_2_or_higher() and pyarrow_dtypes_enabled:
176-
self.backend = "pyarrow"
177-
self.DataFrame = arrow_pandas_df
178-
else:
179-
# For backwards compatible reasons, just mock regular pandas
180-
self.backend = "numpy_nullable"
181-
self.DataFrame = self.pandas.DataFrame
182-
self.testing = ArrowMockTesting()
183-
184-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
185-
return getattr(self.pandas, name)
186-
187-
188130
@pytest.fixture
189131
def require():
190132
def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]:

tests/coverage/test_pandas_categorical_coverage.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import pytest
2-
from conftest import NumpyPandas
1+
import pandas as pd
32

43
import duckdb
54

@@ -9,23 +8,23 @@ def check_result_list(res):
98
assert res_item[0] == res_item[1]
109

1110

12-
def check_create_table(category, pandas):
11+
def check_create_table(category):
1312
conn = duckdb.connect()
1413

1514
conn.execute("PRAGMA enable_verification")
16-
df_in = pandas.DataFrame(
15+
df_in = pd.DataFrame(
1716
{
18-
"x": pandas.Categorical(category, ordered=True),
19-
"y": pandas.Categorical(category, ordered=True),
17+
"x": pd.Categorical(category, ordered=True),
18+
"y": pd.Categorical(category, ordered=True),
2019
"z": category,
2120
}
2221
)
2322

2423
category.append("bla")
2524

26-
df_in_diff = pandas.DataFrame( # noqa: F841
25+
df_in_diff = pd.DataFrame( # noqa: F841
2726
{
28-
"k": pandas.Categorical(category, ordered=True),
27+
"k": pd.Categorical(category, ordered=True),
2928
}
3029
)
3130

@@ -68,14 +67,11 @@ def check_create_table(category, pandas):
6867
conn.execute("DROP TABLE t1")
6968

7069

71-
# TODO: extend tests with ArrowPandas # noqa: TD002, TD003
7270
class TestCategory:
73-
@pytest.mark.parametrize("pandas", [NumpyPandas()])
74-
def test_category_string_uint16(self, duckdb_cursor, pandas):
71+
def test_category_string_uint16(self, duckdb_cursor):
7572
category = [str(i) for i in range(300)]
76-
check_create_table(category, pandas)
73+
check_create_table(category)
7774

78-
@pytest.mark.parametrize("pandas", [NumpyPandas()])
79-
def test_category_string_uint32(self, duckdb_cursor, pandas):
75+
def test_category_string_uint32(self, duckdb_cursor):
8076
category = [str(i) for i in range(70000)]
81-
check_create_table(category, pandas)
77+
check_create_table(category)

tests/extensions/test_httpfs.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import datetime
22
import os
33

4+
import pandas as pd
45
import pytest
5-
from conftest import ArrowPandas, NumpyPandas
66

77
import duckdb
88

@@ -34,8 +34,7 @@ def test_s3fs(self, require):
3434
res = rel.fetchone()
3535
assert res == (1, 0, datetime.date(1965, 2, 28), 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 6, 0, 0, 0, 0)
3636

37-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
38-
def test_httpfs(self, require, pandas):
37+
def test_httpfs(self, require):
3938
connection = require("httpfs")
4039
try:
4140
connection.execute("""
@@ -51,14 +50,14 @@ def test_httpfs(self, require, pandas):
5150
raise
5251

5352
result_df = connection.fetchdf()
54-
exp_result = pandas.DataFrame(
53+
exp_result = pd.DataFrame(
5554
{
56-
"id": pandas.Series([1, 2, 3], dtype="int32"),
55+
"id": pd.Series([1, 2, 3], dtype="int32"),
5756
"first_name": ["Amanda", "Albert", "Evelyn"],
5857
"last_name": ["Jordan", "Freeman", "Morgan"],
5958
}
6059
)
61-
pandas.testing.assert_frame_equal(result_df, exp_result)
60+
pd.testing.assert_frame_equal(result_df, exp_result, check_dtype=False)
6261

6362
def test_http_exception(self, require):
6463
connection = require("httpfs")

0 commit comments

Comments
 (0)