Skip to content

Commit e70b16e

Browse files
committed
feat: enhance stream execution to maintain session context
1 parent 1043b8d commit e70b16e

4 files changed

Lines changed: 47 additions & 5 deletions

File tree

python/tests/test_dataframe.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1573,6 +1573,30 @@ async def test_execute_stream_partitioned_async(df):
15731573
assert not remaining_batches
15741574

15751575

1576+
def test_stream_keeps_context_alive():
    """Streams must stay usable after all Python refs to the context die."""
    session = SessionContext()
    source_batch = pa.record_batch([pa.array([1])], names=["a"])
    frame = session.create_dataframe([[source_batch]])

    batch_stream = frame.execute_stream()
    c_stream_capsule = frame.__arrow_c_stream__()

    # Drop every Python-side reference to the dataframe and its context,
    # then force a collection so only the streams keep them alive.
    del frame
    del session
    gc.collect()

    # The RecordBatchStream must still produce the original batch.
    collected = list(batch_stream)
    assert len(collected) == 1
    assert collected[0].equals(source_batch)

    # The exported Arrow C stream must remain consumable as well.
    imported = pa.RecordBatchReader._import_from_c_capsule(c_stream_capsule)
    result_table = pa.Table.from_batches(imported)
    expected_table = pa.Table.from_batches([source_batch])
    assert result_table.equals(expected_table)
1599+
15761600
def test_empty_to_arrow_table(df):
15771601
# Convert empty datafusion dataframe to pyarrow Table
15781602
pyarrow_table = df.limit(0).to_arrow_table()

src/context.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,10 +1129,11 @@ impl PySessionContext {
11291129
part: usize,
11301130
py: Python,
11311131
) -> PyDataFusionResult<PyRecordBatchStream> {
1132-
let ctx: TaskContext = TaskContext::from(&self.ctx.state());
1132+
let state = self.ctx.state();
1133+
let ctx: TaskContext = TaskContext::from(&state);
11331134
let plan = plan.plan.clone();
11341135
let stream = spawn_future(py, async move { plan.execute(part, Arc::new(ctx)) })?;
1135-
Ok(PyRecordBatchStream::new(stream))
1136+
Ok(PyRecordBatchStream::new(stream, state))
11361137
}
11371138
}
11381139

src/dataframe.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
3434
use datafusion::datasource::TableProvider;
3535
use datafusion::error::DataFusionError;
3636
use datafusion::execution::SendableRecordBatchStream;
37+
use datafusion::execution::session_state::SessionState;
3738
use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
3839
use datafusion::prelude::*;
3940
use datafusion_ffi::table_provider::FFI_TableProvider;
@@ -360,6 +361,8 @@ impl PyDataFrame {
360361
/// converted via `record_batch_into_schema` to apply schema changes per batch.
361362
struct PartitionedDataFrameStreamReader {
362363
streams: Vec<SendableRecordBatchStream>,
364+
// Hold a reference to the session state to keep the context alive
365+
_state: Arc<SessionState>,
363366
schema: SchemaRef,
364367
projection: Option<SchemaRef>,
365368
current: usize,
@@ -951,6 +954,7 @@ impl PyDataFrame {
951954
requested_schema: Option<Bound<'py, PyCapsule>>,
952955
) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
953956
let df = self.df.as_ref().clone();
957+
let state = df.session_state().clone();
954958
let streams = spawn_future(py, async move { df.execute_stream_partitioned().await })?;
955959

956960
let mut schema: Schema = self.df.schema().to_owned().into();
@@ -969,6 +973,7 @@ impl PyDataFrame {
969973
let schema_ref = Arc::new(schema.clone());
970974

971975
let reader = PartitionedDataFrameStreamReader {
976+
_state: state,
972977
streams,
973978
schema: schema_ref,
974979
projection,
@@ -985,14 +990,21 @@ impl PyDataFrame {
985990

986991
fn execute_stream(&self, py: Python) -> PyDataFusionResult<PyRecordBatchStream> {
987992
let df = self.df.as_ref().clone();
993+
let state = df.session_state().clone();
988994
let stream = spawn_future(py, async move { df.execute_stream().await })?;
989-
Ok(PyRecordBatchStream::new(stream))
995+
Ok(PyRecordBatchStream::new(stream, state))
990996
}
991997

992998
fn execute_stream_partitioned(&self, py: Python) -> PyResult<Vec<PyRecordBatchStream>> {
993999
let df = self.df.as_ref().clone();
1000+
let state = df.session_state().clone();
9941001
let streams = spawn_future(py, async move { df.execute_stream_partitioned().await })?;
995-
Ok(streams.into_iter().map(PyRecordBatchStream::new).collect())
1002+
Ok(
1003+
streams
1004+
.into_iter()
1005+
.map(|stream| PyRecordBatchStream::new(stream, state.clone()))
1006+
.collect(),
1007+
)
9961008
}
9971009

9981010
/// Convert to pandas dataframe with pyarrow

src/record_batch.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::errors::PyDataFusionError;
2121
use crate::utils::wait_for_future;
2222
use datafusion::arrow::pyarrow::ToPyArrow;
2323
use datafusion::arrow::record_batch::RecordBatch;
24+
use datafusion::execution::session_state::SessionState;
2425
use datafusion::physical_plan::SendableRecordBatchStream;
2526
use futures::StreamExt;
2627
use pyo3::exceptions::{PyStopAsyncIteration, PyStopIteration};
@@ -66,12 +67,16 @@ pub(crate) fn record_batches_to_pyarrow(
6667
#[pyclass(name = "RecordBatchStream", module = "datafusion", subclass)]
pub struct PyRecordBatchStream {
    // Shared, lock-guarded handle to the underlying DataFusion batch stream.
    stream: Arc<Mutex<SendableRecordBatchStream>>,
    // Hold on to the session state to ensure the underlying context
    // remains alive for the duration of the stream; never read, only
    // kept for its Drop-ordering effect.
    _state: Arc<SessionState>,
}
7074

7175
impl PyRecordBatchStream {
72-
pub fn new(stream: SendableRecordBatchStream) -> Self {
76+
pub fn new(stream: SendableRecordBatchStream, state: Arc<SessionState>) -> Self {
7377
Self {
7478
stream: Arc::new(Mutex::new(stream)),
79+
_state: state,
7580
}
7681
}
7782
}

0 commit comments

Comments
 (0)