Skip to content

Commit 9c362aa

Browse files
committed
refactor: improve session context management in DataFrame stream readers
1 parent 22e213b commit 9c362aa

2 files changed

Lines changed: 33 additions & 5 deletions

File tree

python/datafusion/dataframe.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,8 +1118,9 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
11181118
11191119
The returned capsule holds a reference to the originating
11201120
:class:`SessionContext`, keeping it alive until the stream is fully
1121-
consumed. This makes it safe to drop the original context after obtaining
1122-
the stream.
1121+
consumed. The stream is explicitly closed before the context is
1122+
released, so it is safe to drop the original context after obtaining the
1123+
stream.
11231124
11241125
Args:
11251126
requested_schema: Attempt to provide the DataFrame using this schema.

src/dataframe.rs

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -366,13 +366,19 @@ impl PyDataFrame {
366366
/// converted via `record_batch_into_schema` to apply schema changes per batch.
367367
struct PartitionedDataFrameStreamReader {
368368
streams: Vec<Arc<Mutex<SendableRecordBatchStream>>>,
369-
// Hold a reference to the session context to keep it alive
370-
_ctx: Arc<SessionContext>,
371369
schema: SchemaRef,
372370
projection: Option<SchemaRef>,
373371
current: usize,
374372
}
375373

374+
/// Wrapper that keeps the [`SessionContext`] alive while a
375+
/// [`PartitionedDataFrameStreamReader`] is exported through the Arrow C Stream
376+
/// interface.
377+
struct StreamWithContext {
378+
reader: PartitionedDataFrameStreamReader,
379+
ctx: Arc<SessionContext>,
380+
}
381+
376382
impl Iterator for PartitionedDataFrameStreamReader {
377383
type Item = Result<RecordBatch, ArrowError>;
378384

@@ -419,6 +425,27 @@ impl RecordBatchReader for PartitionedDataFrameStreamReader {
419425
}
420426
}
421427

428+
impl Iterator for StreamWithContext {
429+
type Item = Result<RecordBatch, ArrowError>;
430+
431+
fn next(&mut self) -> Option<Self::Item> {
432+
self.reader.next()
433+
}
434+
}
435+
436+
impl RecordBatchReader for StreamWithContext {
437+
fn schema(&self) -> SchemaRef {
438+
self.reader.schema()
439+
}
440+
}
441+
442+
impl Drop for StreamWithContext {
443+
fn drop(&mut self) {
444+
// Explicitly close streams before the context is released
445+
self.reader.streams.clear();
446+
}
447+
}
448+
422449
#[pymethods]
423450
impl PyDataFrame {
424451
/// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]`
@@ -984,12 +1011,12 @@ impl PyDataFrame {
9841011
let schema_ref = Arc::new(schema.clone());
9851012

9861013
let reader = PartitionedDataFrameStreamReader {
987-
_ctx: ctx,
9881014
streams,
9891015
schema: schema_ref,
9901016
projection,
9911017
current: 0,
9921018
};
1019+
let reader = StreamWithContext { reader, ctx };
9931020
let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
9941021

9951022
let stream = FFI_ArrowArrayStream::new(reader);

0 commit comments

Comments (0)