Skip to content

Commit 38532ff

Browse files
committed
Cache arrow schema to avoid repeated lookups
1 parent 7094be5 commit 38532ff

2 files changed

Lines changed: 33 additions & 4 deletions

File tree

src/duckdb_py/arrow/arrow_array_stream.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,8 +176,16 @@ void PythonTableArrowArrayStreamFactory::GetSchemaInternal(py::handle arrow_obj_
176176
}
177177

178178
void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowSchemaWrapper &schema) {
179-
py::gil_scoped_acquire acquire;
180179
auto factory = static_cast<PythonTableArrowArrayStreamFactory *>(reinterpret_cast<void *>(factory_ptr)); // NOLINT
180+
181+
// Fast path: return the cached schema without acquiring the GIL or making any Python calls
182+
if (factory->schema_cached) {
183+
schema.arrow_schema = factory->cached_schema; // struct copy
184+
schema.arrow_schema.release = nullptr; // non-owning copy
185+
return;
186+
}
187+
188+
py::gil_scoped_acquire acquire;
181189
D_ASSERT(factory->arrow_object);
182190
py::handle arrow_obj_handle(factory->arrow_object);
183191

@@ -188,8 +196,11 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS
188196
auto schema_capsule = arrow_obj_handle.attr("__arrow_c_schema__")();
189197
auto capsule = py::reinterpret_borrow<py::capsule>(schema_capsule);
190198
auto arrow_schema = capsule.get_pointer<struct ArrowSchema>();
191-
schema.arrow_schema = *arrow_schema;
192-
arrow_schema->release = nullptr; // take ownership
199+
factory->cached_schema = *arrow_schema; // factory takes ownership
200+
arrow_schema->release = nullptr;
201+
factory->schema_cached = true;
202+
schema.arrow_schema = factory->cached_schema; // non-owning copy
203+
schema.arrow_schema.release = nullptr;
193204
return;
194205
}
195206
// Otherwise try to use .schema with _export_to_c
@@ -211,6 +222,13 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS
211222
return; // stream_capsule goes out of scope, stream released by capsule destructor
212223
}
213224
GetSchemaInternal(arrow_obj_handle, schema);
225+
226+
// Cache for Table and Dataset (immutable schema)
227+
if (type == PyArrowObjectType::Table || type == PyArrowObjectType::Dataset) {
228+
factory->cached_schema = schema.arrow_schema; // factory takes ownership
229+
schema.arrow_schema.release = nullptr; // caller gets non-owning copy
230+
factory->schema_cached = true;
231+
}
214232
}
215233

216234
} // namespace duckdb

src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,15 @@ class PythonTableArrowArrayStreamFactory {
6161
public:
6262
explicit PythonTableArrowArrayStreamFactory(PyObject *arrow_table, const ClientProperties &client_properties_p,
6363
PyArrowObjectType arrow_type_p)
64-
: arrow_object(arrow_table), client_properties(client_properties_p), cached_arrow_type(arrow_type_p) {};
64+
: arrow_object(arrow_table), client_properties(client_properties_p), cached_arrow_type(arrow_type_p) {
65+
cached_schema.release = nullptr;
66+
}
67+
68+
~PythonTableArrowArrayStreamFactory() {
69+
if (cached_schema.release) {
70+
cached_schema.release(&cached_schema);
71+
}
72+
}
6573

6674
//! Produces an Arrow Scanner; should only be called once, when initializing Scan States
6775
static unique_ptr<ArrowArrayStreamWrapper> Produce(uintptr_t factory, ArrowStreamParameters &parameters);
@@ -77,6 +85,9 @@ class PythonTableArrowArrayStreamFactory {
7785
const PyArrowObjectType cached_arrow_type;
7886

7987
private:
88+
ArrowSchema cached_schema;
89+
bool schema_cached = false;
90+
8091
static py::object ProduceScanner(py::object &arrow_scanner, py::handle &arrow_obj_handle,
8192
ArrowStreamParameters &parameters, const ClientProperties &client_properties);
8293
};

0 commit comments

Comments
 (0)