Skip to content

Commit 58e68f6

Browse files
Fix numeric conversion logic (#332)
Fixes #115, #171 and #330
2 parents 0020492 + 16621e8 commit 58e68f6

4 files changed

Lines changed: 268 additions & 161 deletions

File tree

src/duckdb_py/include/duckdb_python/python_conversion.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ enum class PythonObjectType {
4545

4646
PythonObjectType GetPythonObjectType(py::handle &ele);
4747

48-
bool TryTransformPythonNumeric(Value &res, py::handle ele, const LogicalType &target_type = LogicalType::UNKNOWN);
48+
LogicalType SniffPythonIntegerType(py::handle ele);
4949
bool DictionaryHasMapFormat(const PyDictionary &dict);
5050
void TransformPythonObject(py::handle ele, Vector &vector, idx_t result_offset, bool nan_as_null = true);
5151
Value TransformPythonValue(py::handle ele, const LogicalType &target_type = LogicalType::UNKNOWN,

src/duckdb_py/native/python_conversion.cpp

Lines changed: 128 additions & 156 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,36 @@
1313

1414
namespace duckdb {
1515

16+
// Like DefaultCastAs, but handles UNION targets by finding the first compatible member. DefaultCastAs raises a
17+
// Conversion Error when multiple UNION members have the same type (e.g. UNION(u1 DOUBLE, u2 DOUBLE)), so for UNION
18+
// targets we resolve the member ourselves.
19+
static Value CastToTarget(Value val, const LogicalType &target_type) {
20+
if (target_type.id() != LogicalTypeId::UNION) {
21+
return val.DefaultCastAs(target_type);
22+
}
23+
24+
auto member_count = UnionType::GetMemberCount(target_type);
25+
auto &source_type = val.type();
26+
27+
// First pass: if there's an exact type match we use that
28+
for (idx_t i = 0; i < member_count; i++) {
29+
if (UnionType::GetMemberType(target_type, i) == source_type) {
30+
return Value::UNION(UnionType::CopyMemberTypes(target_type), NumericCast<uint8_t>(i), std::move(val));
31+
}
32+
}
33+
34+
// Second pass: if there's a type we can implicitly cast to, we do that
35+
for (idx_t i = 0; i < member_count; i++) {
36+
auto member_type = UnionType::GetMemberType(target_type, i);
37+
Value candidate = val;
38+
if (candidate.DefaultTryCastAs(member_type)) {
39+
return Value::UNION(UnionType::CopyMemberTypes(target_type), NumericCast<uint8_t>(i), std::move(candidate));
40+
}
41+
}
42+
throw ConversionException("Could not convert value of type %s to %s", source_type.ToString(),
43+
target_type.ToString());
44+
}
45+
1646
static Value EmptyMapValue() {
1747
auto map_type = LogicalType::MAP(LogicalType::SQLNULL, LogicalType::SQLNULL);
1848
return Value::MAP(ListType::GetChildType(map_type), vector<Value>());
@@ -92,7 +122,7 @@ Value TransformDictionaryToStruct(const PyDictionary &dict, const LogicalType &t
92122
child_list_t<Value> struct_values;
93123
for (idx_t i = 0; i < dict.len; i++) {
94124
auto &key = struct_target ? StructType::GetChildName(target_type, i) : struct_keys[i];
95-
auto value_index = key_mapping[key];
125+
auto value_index = struct_target ? key_mapping[key] : i;
96126
auto &child_type = struct_target ? StructType::GetChildType(target_type, i) : LogicalType::UNKNOWN;
97127
auto val = TransformPythonValue(dict.values.attr("__getitem__")(value_index), child_type);
98128
struct_values.emplace_back(make_pair(std::move(key), std::move(val)));
@@ -230,150 +260,108 @@ Value TransformTupleToStruct(py::handle ele, const LogicalType &target_type = Lo
230260
return result;
231261
}
232262

233-
bool TryTransformPythonIntegerToDouble(Value &res, py::handle ele) {
234-
double number = PyLong_AsDouble(ele.ptr());
235-
if (number == -1.0 && PyErr_Occurred()) {
263+
// Tries to convert a Python integer that overflows int64/uint64 into a HUGEINT or UHUGEINT Value
264+
// by decomposing it into upper and lower 64-bit components. Tries HUGEINT first; falls back to
265+
// UHUGEINT for large positive values. Returns false if the value doesn't fit in 128 bits.
266+
static bool TryTransformPythonLongToHugeInt(py::handle ele, const LogicalType &target_type, Value &result) {
267+
auto ptr = ele.ptr();
268+
269+
// Extract lower 64 bits (two's complement, works for negative values too)
270+
uint64_t lower = PyLong_AsUnsignedLongLongMask(ptr);
271+
if (lower == static_cast<uint64_t>(-1) && PyErr_Occurred()) {
236272
PyErr_Clear();
237273
return false;
238274
}
239-
res = Value::DOUBLE(number);
275+
276+
// Extract upper bits by right-shifting by 64
277+
py::int_ shift_amount(64);
278+
py::object upper_obj = py::reinterpret_steal<py::object>(PyNumber_Rshift(ptr, shift_amount.ptr()));
279+
280+
// Try signed 128-bit (hugeint) first
281+
int overflow;
282+
int64_t upper_signed = PyLong_AsLongLongAndOverflow(upper_obj.ptr(), &overflow);
283+
if (overflow == 0 && !(upper_signed == -1 && PyErr_Occurred())) {
284+
auto val = Value::HUGEINT(hugeint_t {upper_signed, lower});
285+
if (target_type.id() == LogicalTypeId::UNKNOWN || target_type.id() == LogicalTypeId::HUGEINT) {
286+
result = val;
287+
} else {
288+
result = CastToTarget(std::move(val), target_type);
289+
}
290+
return true;
291+
}
292+
PyErr_Clear();
293+
294+
// Try unsigned 128-bit (uhugeint)
295+
uint64_t upper_unsigned = PyLong_AsUnsignedLongLong(upper_obj.ptr());
296+
if (PyErr_Occurred()) {
297+
PyErr_Clear();
298+
return false;
299+
}
300+
301+
auto val = Value::UHUGEINT(uhugeint_t {upper_unsigned, lower});
302+
if (target_type.id() == LogicalTypeId::UNKNOWN || target_type.id() == LogicalTypeId::UHUGEINT) {
303+
result = val;
304+
} else {
305+
result = CastToTarget(std::move(val), target_type);
306+
}
240307
return true;
241308
}
242309

243-
void TransformPythonUnsigned(uint64_t value, Value &res) {
244-
if (value > (uint64_t)std::numeric_limits<uint32_t>::max()) {
245-
res = Value::UBIGINT(value);
246-
} else if (value > (int64_t)std::numeric_limits<uint16_t>::max()) {
247-
res = Value::UINTEGER(value);
248-
} else if (value > (int64_t)std::numeric_limits<uint16_t>::max()) {
249-
res = Value::USMALLINT(value);
250-
} else {
251-
res = Value::UTINYINT(value);
310+
// Throwing wrapper for contexts that require a result (e.g. prepared statement parameters).
311+
static Value TransformPythonLongToHugeInt(py::handle ele, const LogicalType &target_type) {
312+
Value result;
313+
if (!TryTransformPythonLongToHugeInt(ele, target_type, result)) {
314+
throw InvalidInputException("Python integer too large for 128-bit integer type: %s", std::string(py::str(ele)));
252315
}
316+
return result;
253317
}
254318

255-
bool TrySniffPythonNumeric(Value &res, int64_t value) {
319+
// Picks the tightest DuckDB integer type (>=INT32) for an int64 value when no target type is specified.
320+
static Value SniffIntegerValue(int64_t value) {
256321
if (value < (int64_t)std::numeric_limits<int32_t>::min() || value > (int64_t)std::numeric_limits<int32_t>::max()) {
257-
res = Value::BIGINT(value);
258-
} else {
259-
// To match default duckdb behavior, numeric values without a specified type should not become a smaller type
260-
// than INT32
261-
res = Value::INTEGER(value);
322+
return Value::BIGINT(value);
262323
}
263-
return true;
324+
return Value::INTEGER(value);
264325
}
265326

266-
// TODO: add support for HUGEINT
267-
bool TryTransformPythonNumeric(Value &res, py::handle ele, const LogicalType &target_type) {
327+
// Sniffs the tightest DuckDB integer type for a Python integer.
328+
// Progressively widens: int64 → uint64 → hugeint → uhugeint.
329+
// Returns SQLNULL if the value doesn't fit in any DuckDB integer type (> 128-bit).
330+
LogicalType SniffPythonIntegerType(py::handle ele) {
268331
auto ptr = ele.ptr();
269332

333+
// Step 1: Try int64
270334
int overflow;
271-
int64_t value = PyLong_AsLongLongAndOverflow(ptr, &overflow);
272-
if (overflow == -1) {
273-
PyErr_Clear();
274-
if (target_type.id() == LogicalTypeId::BIGINT) {
275-
throw InvalidInputException(StringUtil::Format("Failed to cast value: Python value '%s' to INT64",
276-
std::string(pybind11::str(ele))));
277-
}
278-
auto cast_as = target_type.id() == LogicalTypeId::UNKNOWN ? LogicalType::HUGEINT : target_type;
279-
auto numeric_string = std::string(py::str(ele));
280-
res = Value(numeric_string).DefaultCastAs(cast_as);
281-
return true;
282-
} else if (overflow == 1) {
283-
if (target_type.InternalType() == PhysicalType::INT64) {
284-
throw InvalidInputException(StringUtil::Format("Failed to cast value: Python value '%s' to INT64",
285-
std::string(pybind11::str(ele))));
286-
}
287-
uint64_t unsigned_value = PyLong_AsUnsignedLongLong(ptr);
288-
if (PyErr_Occurred()) {
289-
PyErr_Clear();
290-
return TryTransformPythonIntegerToDouble(res, ele);
291-
} else {
292-
TransformPythonUnsigned(unsigned_value, res);
293-
}
294-
PyErr_Clear();
295-
return true;
296-
} else if (value == -1 && PyErr_Occurred()) {
297-
return false;
298-
}
299-
300-
// The value is int64_t or smaller
335+
const int64_t value = PyLong_AsLongLongAndOverflow(ptr, &overflow);
301336

302-
switch (target_type.id()) {
303-
case LogicalTypeId::UNKNOWN:
304-
return TrySniffPythonNumeric(res, value);
305-
case LogicalTypeId::HUGEINT: {
306-
res = Value::HUGEINT(value);
307-
return true;
308-
}
309-
case LogicalTypeId::UHUGEINT: {
310-
if (value < 0) {
311-
return false;
312-
}
313-
res = Value::UHUGEINT(value);
314-
return true;
315-
}
316-
case LogicalTypeId::BIGINT: {
317-
res = Value::BIGINT(value);
318-
return true;
319-
}
320-
case LogicalTypeId::INTEGER: {
321-
if (value < NumericLimits<int32_t>::Minimum() || value > NumericLimits<int32_t>::Maximum()) {
322-
return false;
323-
}
324-
res = Value::INTEGER(value);
325-
return true;
326-
}
327-
case LogicalTypeId::SMALLINT: {
328-
if (value < NumericLimits<int16_t>::Minimum() || value > NumericLimits<int16_t>::Maximum()) {
329-
return false;
330-
}
331-
res = Value::SMALLINT(value);
332-
return true;
333-
}
334-
case LogicalTypeId::TINYINT: {
335-
if (value < NumericLimits<int8_t>::Minimum() || value > NumericLimits<int8_t>::Maximum()) {
336-
return false;
337-
}
338-
res = Value::TINYINT(value);
339-
return true;
340-
}
341-
case LogicalTypeId::UBIGINT: {
342-
if (value < 0) {
343-
return false;
344-
}
345-
res = Value::UBIGINT(value);
346-
return true;
347-
}
348-
case LogicalTypeId::UINTEGER: {
349-
if (value < 0 || value > (int64_t)NumericLimits<uint32_t>::Maximum()) {
350-
return false;
351-
}
352-
res = Value::UINTEGER(value);
353-
return true;
354-
}
355-
case LogicalTypeId::USMALLINT: {
356-
if (value < 0 || value > (int64_t)NumericLimits<uint16_t>::Maximum()) {
357-
return false;
337+
if (overflow == 0) {
338+
if (value == -1 && PyErr_Occurred()) {
339+
PyErr_Clear();
340+
return LogicalType::SQLNULL;
358341
}
359-
res = Value::USMALLINT(value);
360-
return true;
361-
}
362-
case LogicalTypeId::UTINYINT: {
363-
if (value < 0 || value > (int64_t)NumericLimits<uint8_t>::Maximum()) {
364-
return false;
342+
if (value < static_cast<int64_t>(std::numeric_limits<int32_t>::min()) ||
343+
value > static_cast<int64_t>(std::numeric_limits<int32_t>::max())) {
344+
return LogicalType::BIGINT;
365345
}
366-
res = Value::UTINYINT(value);
367-
return true;
346+
return LogicalType::INTEGER;
368347
}
369-
default: {
370-
if (!TrySniffPythonNumeric(res, value)) {
371-
return false;
348+
PyErr_Clear();
349+
350+
// Step 2: For positive overflow, try uint64
351+
if (overflow == 1) {
352+
(void)PyLong_AsUnsignedLongLong(ptr);
353+
if (!PyErr_Occurred()) {
354+
return LogicalType::UBIGINT;
372355
}
373-
res = res.DefaultCastAs(target_type, true);
374-
return true;
356+
PyErr_Clear();
375357
}
358+
359+
// Step 3: Try 128-bit (hugeint/uhugeint)
360+
Value res;
361+
if (!TryTransformPythonLongToHugeInt(ele, LogicalType::UNKNOWN, res)) {
362+
return LogicalType::SQLNULL;
376363
}
364+
return res.type();
377365
}
378366

379367
Value TransformDictionary(const PyDictionary &dict) {
@@ -476,33 +464,22 @@ struct PythonValueConversion {
476464
target_type.ToString());
477465
}
478466
default:
479-
throw ConversionException("Could not convert 'float' to type %s", target_type.ToString());
467+
result = CastToTarget(Value::DOUBLE(val), target_type);
468+
break;
480469
}
481470
}
482-
static void HandleLongAsDouble(Value &result, const LogicalType &target_type, double val) {
483-
auto cast_as = target_type.id() == LogicalTypeId::UNKNOWN ? LogicalType::DOUBLE : target_type;
484-
result = Value::DOUBLE(val).DefaultCastAs(cast_as);
471+
static void HandleLongOverflow(Value &result, const LogicalType &target_type, py::handle ele) {
472+
result = TransformPythonLongToHugeInt(ele, target_type);
485473
}
486474
static void HandleUnsignedBigint(Value &result, const LogicalType &target_type, uint64_t val) {
487475
auto cast_as = target_type.id() == LogicalTypeId::UNKNOWN ? LogicalType::UBIGINT : target_type;
488-
result = Value::UBIGINT(val).DefaultCastAs(cast_as);
476+
result = CastToTarget(Value::UBIGINT(val), cast_as);
489477
}
490478
static void HandleBigint(Value &res, const LogicalType &target_type, int64_t value) {
491-
switch (target_type.id()) {
492-
case LogicalTypeId::UNKNOWN: {
493-
if (value < (int64_t)std::numeric_limits<int32_t>::min() ||
494-
value > (int64_t)std::numeric_limits<int32_t>::max()) {
495-
res = Value::BIGINT(value);
496-
} else {
497-
// To match default duckdb behavior, numeric values without a specified type should not become a smaller
498-
// type than INT32
499-
res = Value::INTEGER(value);
500-
}
501-
break;
502-
}
503-
default:
504-
res = Value::BIGINT(value).DefaultCastAs(target_type);
505-
break;
479+
if (target_type.id() == LogicalTypeId::UNKNOWN) {
480+
res = SniffIntegerValue(value);
481+
} else {
482+
res = CastToTarget(SniffIntegerValue(value), target_type);
506483
}
507484
}
508485

@@ -511,7 +488,7 @@ struct PythonValueConversion {
511488
(target_type.id() == LogicalTypeId::VARCHAR && !target_type.HasAlias())) {
512489
result = Value(value);
513490
} else {
514-
result = Value(value).DefaultCastAs(target_type);
491+
result = CastToTarget(Value(value), target_type);
515492
}
516493
}
517494

@@ -648,13 +625,13 @@ struct PythonVectorConversion {
648625
break;
649626
}
650627
default:
651-
throw TypeMismatchException(
652-
LogicalType::DOUBLE, result.GetType(),
653-
"Python Conversion Failure: Expected a value of type %s, but got a value of type double");
628+
FallbackValueConversion(result, result_offset, CastToTarget(Value::DOUBLE(val), result.GetType()));
629+
break;
654630
}
655631
}
656-
static void HandleLongAsDouble(Vector &result, const idx_t &result_offset, double val) {
657-
FallbackValueConversion(result, result_offset, Value::DOUBLE(val));
632+
static void HandleLongOverflow(Vector &result, const idx_t &result_offset, py::handle ele) {
633+
Value result_val = TransformPythonLongToHugeInt(ele, result.GetType());
634+
FallbackValueConversion(result, result_offset, std::move(result_val));
658635
}
659636
static void HandleUnsignedBigint(Vector &result, const idx_t &result_offset, uint64_t value) {
660637
// this code path is only called for values in the range of [INT64_MAX...UINT64_MAX]
@@ -669,7 +646,7 @@ struct PythonVectorConversion {
669646
FlatVector::GetData<uint64_t>(result)[result_offset] = value;
670647
break;
671648
default:
672-
FallbackValueConversion(result, result_offset, Value::UBIGINT(value));
649+
FallbackValueConversion(result, result_offset, CastToTarget(Value::UBIGINT(value), result.GetType()));
673650
break;
674651
}
675652
}
@@ -740,7 +717,7 @@ struct PythonVectorConversion {
740717
break;
741718
}
742719
default:
743-
FallbackValueConversion(result, result_offset, Value::BIGINT(value));
720+
FallbackValueConversion(result, result_offset, CastToTarget(Value::BIGINT(value), result.GetType()));
744721
break;
745722
}
746723
}
@@ -966,12 +943,7 @@ void TransformPythonObjectInternal(py::handle ele, A &result, const B &param, bo
966943
conversion_target);
967944
}
968945
}
969-
double number = PyLong_AsDouble(ele.ptr());
970-
if (number == -1.0 && PyErr_Occurred()) {
971-
PyErr_Clear();
972-
throw InvalidInputException("An error occurred attempting to convert a python integer");
973-
}
974-
OP::HandleLongAsDouble(result, param, number);
946+
OP::HandleLongOverflow(result, param, ele);
975947
} else if (value == -1 && PyErr_Occurred()) {
976948
throw InvalidInputException("An error occurred attempting to convert a python integer");
977949
} else {

0 commit comments

Comments
 (0)