test: add lifetime test for Arrow Table conversion in DataFrame

kosiew · kosiew · commit 632f614aab4b · 2025-09-08T11:33:31.000+08:00
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1581,6 +1581,19 @@ def test_empty_to_arrow_table(df):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
+def test_to_arrow_table_lifetime(ctx):
+    frame = ctx.sql("SELECT 1 AS a")
+    arrow_table = frame.to_arrow_table()
+
+    # Drop the local DataFrame reference and run a collection pass
+    # before reading the table back; the rows must still be readable
+    # from the PyArrow Table once the DataFrame is gone.
+    del frame
+    gc.collect()
+
+    assert arrow_table.to_pylist() == [{"a": 1}]
+
+
 def test_iter_batches_dataframe(fail_collect):
     ctx = SessionContext()
 
diff --git a/src/dataframe.rs b/src/dataframe.rs
@@ -19,14 +19,14 @@ use std::collections::HashMap;
 use std::ffi::CString;
 use std::sync::Arc;
 
-use arrow::array::{new_null_array, RecordBatch, RecordBatchReader};
+use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader};
 use arrow::compute::can_cast_types;
 use arrow::error::ArrowError;
 use arrow::ffi::FFI_ArrowSchema;
 use arrow::ffi_stream::FFI_ArrowArrayStream;
 use arrow::pyarrow::FromPyArrow;
 use datafusion::arrow::datatypes::{Schema, SchemaRef};
-use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
+use datafusion::arrow::pyarrow::{IntoPyArrow, PyArrowType, ToPyArrow};
 use datafusion::arrow::util::pretty;
 use datafusion::common::UnnestOptions;
 use datafusion::config::{CsvOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions};
@@ -918,15 +918,31 @@ impl PyDataFrame {
     }
 
     /// Convert to Arrow Table
-    /// Collect the batches and pass to Arrow Table
+    ///
+    /// Collect [`RecordBatch`]es in Rust and stream them to PyArrow
+    /// to construct a `pyarrow.Table` without creating intermediate
+    /// Python `RecordBatch` objects. The `RecordBatchIterator` owns the
+    /// `RecordBatch`es so the underlying buffers live long enough for
+    /// PyArrow to consume them. An empty result keeps the DataFrame's
+    /// schema, so the returned table still has its columns.
     fn to_arrow_table(&self, py: Python<'_>) -> PyResult<PyObject> {
-        let batches = self.collect(py)?.into_pyobject(py)?;
-        let schema = self.schema().into_pyobject(py)?;
+        // Collect the batches on the Rust side
+        let df = self.df.as_ref().clone();
+        let batches = wait_for_future(py, df.collect())?.map_err(PyDataFusionError::from)?;
+
+        // Prefer the schema of the collected batches; with no batches,
+        // fall back to the plan schema rather than an empty one.
+        let schema = batches
+            .first()
+            .map(|b| b.schema())
+            .unwrap_or_else(|| Arc::new(Schema::from(self.df.schema())));
+        let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema);
+        let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
+        let py_reader = reader.into_pyarrow(py)?;
 
-        // Instantiate pyarrow Table object and use its from_batches method
-        let table_class = py.import("pyarrow")?.getattr("Table")?;
-        let args = PyTuple::new(py, &[batches, schema])?;
-        let table: PyObject = table_class.call_method1("from_batches", args)?.into();
+        // `read_all` constructs the pyarrow.Table directly from the
+        // stream without materializing Python RecordBatch objects.
+        let table = py_reader.call_method0(py, "read_all")?;
         Ok(table)
     }