Skip to content

Commit cf46050

Browse files
[PyArrow] Filter Pushdown of IS IN now uses the dedicated pyarrow.compure.Expression.isin method (#335)
Continued from #73 Null handling seems to work fine, afaics. We are using [`Expression.isin()`](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression.isin), which does not have a `skipp_nulls` kwarg. It uses the same default semantics as [`pyarrow.compute.is_in`](https://arrow.apache.org/docs/python/generated/pyarrow.compute.is_in.html#pyarrow.compute.is_in), which is `skip_nulls = False`. This is not exactly the same as duckdb's semantics, because DuckDB will not return NULL values even if NULL is in the IN list: ``` D select * from VALUES(1), (2), (3), (NULL) as t(myint) where myint in (2, NULL); ┌───────┐ │ myint │ │ int32 │ ├───────┤ │ 2 │ └───────┘ ``` But this is no big deal. We might let more rows through than is needed, but duckdb will filter them out anyway. Also see the test.
2 parents 542ecd6 + 3eaf3b3 commit cf46050

2 files changed

Lines changed: 39 additions & 11 deletions

File tree

src/duckdb_py/arrow/pyarrow_filter_pushdown.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,15 @@ py::object GetScalar(Value &constant, const string &timezone_config, const Arrow
160160
}
161161
}
162162

163+
static py::list TransformInList(const InFilter &in) {
164+
py::list res;
165+
ClientProperties default_properties;
166+
for (auto &val : in.values) {
167+
res.append(PythonObject::FromValue(val, val.type(), default_properties));
168+
}
169+
return res;
170+
}
171+
163172
py::object TransformFilterRecursive(TableFilter &filter, vector<string> column_ref, const string &timezone_config,
164173
const ArrowType &type) {
165174
auto &import_cache = *DuckDBPyConnection::ImportCache();
@@ -282,17 +291,9 @@ py::object TransformFilterRecursive(TableFilter &filter, vector<string> column_r
282291
}
283292
case TableFilterType::IN_FILTER: {
284293
auto &in_filter = filter.Cast<InFilter>();
285-
ConjunctionOrFilter or_filter;
286-
value_set_t unique_values;
287-
for (const auto &value : in_filter.values) {
288-
if (unique_values.find(value) == unique_values.end()) {
289-
unique_values.insert(value);
290-
}
291-
}
292-
for (const auto &value : unique_values) {
293-
or_filter.child_filters.push_back(make_uniq<ConstantFilter>(ExpressionType::COMPARE_EQUAL, value));
294-
}
295-
return TransformFilterRecursive(or_filter, column_ref, timezone_config, type);
294+
auto constant_field = field(py::tuple(py::cast(column_ref)));
295+
auto in_list = TransformInList(in_filter);
296+
return constant_field.attr("isin")(std::move(in_list));
296297
}
297298
case TableFilterType::DYNAMIC_FILTER: {
298299
//! Ignore dynamic filters for now, not necessary for correctness

tests/fast/arrow/test_filter_pushdown.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,33 @@ def test_in_filter_pushdown(self, duckdb_cursor):
921921
duckdb_conn.register("duck_probe_arrow", duck_probe_arrow)
922922
assert duckdb_conn.execute("SELECT * from duck_probe_arrow where a = any([1,999])").fetchall() == [(1,), (999,)]
923923

924+
@pytest.mark.timeout(10)
925+
def test_in_filter_pushdown_large_list(self, duckdb_cursor):
926+
"""Large IN lists must not hang. Regression test for https://github.com/duckdb/duckdb-python/issues/52."""
927+
arrow_table = pa.table({"a": pa.array(range(5000))})
928+
in_list = ", ".join(str(i) for i in range(0, 5000, 2))
929+
result = duckdb.sql(f"SELECT count(*) FROM arrow_table WHERE a IN ({in_list})").fetchone()
930+
assert result == (2500,)
931+
932+
def test_in_filter_pushdown_with_nulls(self, duckdb_cursor):
933+
arrow_table = pa.table({"a": pa.array([1, 2, None, 4, None, 6])})
934+
# IN list without NULL: null rows should not match
935+
result = duckdb.sql("SELECT a FROM arrow_table WHERE a IN (1, 4) ORDER BY a").fetchall()
936+
assert result == [(1,), (4,)]
937+
# IN list with NULL: null rows still should not match (SQL semantics)
938+
result = duckdb.sql("SELECT a FROM arrow_table WHERE a IN (1, 4, NULL) ORDER BY a").fetchall()
939+
assert result == [(1,), (4,)]
940+
941+
def test_in_filter_pushdown_varchar(self, duckdb_cursor):
942+
arrow_table = pa.table({"s": pa.array(["alice", "bob", "charlie", "dave", None])})
943+
result = duckdb.sql("SELECT s FROM arrow_table WHERE s IN ('bob', 'dave') ORDER BY s").fetchall()
944+
assert result == [("bob",), ("dave",)]
945+
946+
def test_in_filter_pushdown_float(self, duckdb_cursor):
947+
arrow_table = pa.table({"f": pa.array([1.0, 2.5, 3.75, 4.0, None], type=pa.float64())})
948+
result = duckdb.sql("SELECT f FROM arrow_table WHERE f IN (2.5, 4.0) ORDER BY f").fetchall()
949+
assert result == [(2.5,), (4.0,)]
950+
924951
def test_pushdown_of_optional_filter(self, duckdb_cursor):
925952
cardinality_table = pa.Table.from_pydict(
926953
{

0 commit comments

Comments
 (0)