
Commit a5efa67

UNPICK revert Arrow streaming
1 parent 48fa874 commit a5efa67

3 files changed: 18 additions & 84 deletions


docs/source/user-guide/io/arrow.rst

Lines changed: 5 additions & 4 deletions
@@ -59,10 +59,11 @@ Exporting from DataFusion
 
 DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any
 Python library that accepts these can import a DataFusion DataFrame directly.
-The exported stream yields record batches lazily using DataFusion's
-``execute_stream`` mechanism, allowing consumers to process results incrementally
-without buffering the entire dataset in memory. This streaming behavior helps
-avoid out-of-memory failures when working with large queries.
+
+.. warning::
+    It is important to note that this will cause the DataFrame execution to happen, which may be
+    a time consuming task. That is, you will cause a
+    :py:func:`datafusion.dataframe.DataFrame.collect` operation call to occur.
 
 
 .. ipython:: python
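
For context, a minimal sketch of what the new warning means for consumers. The table name `t` and the data are illustrative, not part of the commit, and `pa.RecordBatchReader.from_stream` assumes pyarrow >= 14:

import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
# Illustrative data; any registered table or SQL result behaves the same.
df = ctx.from_pydict({"a": [1, 2, 3]}, name="t")

# from_stream() calls df.__arrow_c_stream__() under the hood. After this
# commit, that call collects the full result set before returning the
# reader, so it may be slow and memory-hungry for large queries.
reader = pa.RecordBatchReader.from_stream(df)
for batch in reader:
    print(batch.num_rows)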

python/tests/test_dataframe.py

Lines changed: 0 additions & 18 deletions
@@ -20,7 +20,6 @@
 import re
 import threading
 import time
-import tracemalloc
 from typing import Any
 
 import pyarrow as pa
@@ -1568,23 +1567,6 @@ async def test_execute_stream_partitioned_async(df):
     assert not remaining_batches
 
 
-def test_arrow_c_stream_streaming(large_df):
-    df = large_df.repartition(4)
-    capsule = df.__arrow_c_stream__()
-    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
-    ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
-    ptr = ctypes.pythonapi.PyCapsule_GetPointer(capsule, b"arrow_array_stream")
-    reader = pa.RecordBatchReader._import_from_c(ptr)
-
-    tracemalloc.start()
-    batch_count = sum(1 for _ in reader)
-    current, peak = tracemalloc.get_traced_memory()
-    tracemalloc.stop()
-
-    assert batch_count > 1
-    assert peak < 50 * MB
-
-
 def test_empty_to_arrow_table(df):
     # Convert empty datafusion dataframe to pyarrow Table
     pyarrow_table = df.limit(0).to_arrow_table()
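
The deleted test asserted bounded peak memory while iterating the exported stream, an invariant that no longer holds once the export collects eagerly. Incremental consumption remains available through `DataFrame.execute_stream()`; a hedged sketch with illustrative data, assuming the datafusion-python `RecordBatchStream` iterator API:

from datafusion import SessionContext

ctx = SessionContext()
# Illustrative data; a real memory test would need a much larger input.
df = ctx.from_pydict({"a": list(range(10_000))}, name="big")

# execute_stream() yields record batches one at a time instead of
# materializing the whole result, so memory stays roughly proportional
# to a single batch rather than to the full result set.
for batch in df.execute_stream():
    print(batch.to_pyarrow().num_rows)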

src/dataframe.rs

Lines changed: 13 additions & 62 deletions
@@ -19,13 +19,13 @@ use std::collections::HashMap;
 use std::ffi::CString;
 use std::sync::Arc;
 
-use arrow::array::{new_null_array, RecordBatch, RecordBatchReader};
+use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader};
 use arrow::compute::can_cast_types;
 use arrow::error::ArrowError;
 use arrow::ffi::FFI_ArrowSchema;
 use arrow::ffi_stream::FFI_ArrowArrayStream;
 use arrow::pyarrow::FromPyArrow;
-use datafusion::arrow::datatypes::{Schema, SchemaRef};
+use datafusion::arrow::datatypes::Schema;
 use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
 use datafusion::arrow::util::pretty;
 use datafusion::common::UnnestOptions;
@@ -879,17 +879,8 @@ impl PyDataFrame {
         py: Python<'py>,
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
-        // execute query lazily using a stream
-        let rt = &get_tokio_runtime().0;
-        let df = self.df.as_ref().clone();
-        let fut: JoinHandle<datafusion::common::Result<SendableRecordBatchStream>> =
-            rt.spawn(async move { df.execute_stream().await });
-        let stream = wait_for_future(py, async { fut.await.map_err(to_datafusion_err) })???;
-
-        // Determine the schema and handle optional projection
-        let stream_schema = stream.schema();
-        let mut schema: Schema = stream_schema.as_ref().to_owned().into();
-        let mut project = false;
+        let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())??;
+        let mut schema: Schema = self.df.schema().to_owned().into();
 
         if let Some(schema_capsule) = requested_schema {
             validate_pycapsule(&schema_capsule, "arrow_schema")?;
@@ -898,12 +889,17 @@
             let desired_schema = Schema::try_from(schema_ptr)?;
 
             schema = project_schema(schema, desired_schema)?;
-            project = schema != *stream_schema.as_ref();
+
+            batches = batches
+                .into_iter()
+                .map(|record_batch| record_batch_into_schema(record_batch, &schema))
+                .collect::<Result<Vec<RecordBatch>, ArrowError>>()?;
         }
 
-        let schema_ref: SchemaRef = Arc::new(schema);
-        let reader: Box<dyn RecordBatchReader + Send> =
-            Box::new(ArrowStreamReader::new(stream, schema_ref, project));
+        let batches_wrapped = batches.into_iter().map(Ok);
+
+        let reader = RecordBatchIterator::new(batches_wrapped, Arc::new(schema));
+        let reader: Box<dyn RecordBatchReader + Send> = Box::new(reader);
 
         let ffi_stream = FFI_ArrowArrayStream::new(reader);
         let stream_capsule_name = CString::new("arrow_array_stream").unwrap();
@@ -998,51 +994,6 @@ impl PyDataFrame {
     }
 }
 
-struct ArrowStreamReader {
-    stream: SendableRecordBatchStream,
-    schema: SchemaRef,
-    project: bool,
-}
-
-impl ArrowStreamReader {
-    fn new(stream: SendableRecordBatchStream, schema: SchemaRef, project: bool) -> Self {
-        Self {
-            stream,
-            schema,
-            project,
-        }
-    }
-}
-
-impl RecordBatchReader for ArrowStreamReader {
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
-    }
-}
-
-impl Iterator for ArrowStreamReader {
-    type Item = Result<RecordBatch, ArrowError>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let rt = &get_tokio_runtime().0;
-        match rt.block_on(self.stream.next()) {
-            Some(Ok(batch)) => {
-                let batch = if self.project {
-                    match record_batch_into_schema(batch, self.schema.as_ref()) {
-                        Ok(b) => b,
-                        Err(e) => return Some(Err(e)),
-                    }
-                } else {
-                    batch
-                };
-                Some(Ok(batch))
-            }
-            Some(Err(e)) => Some(Err(ArrowError::from(e))),
-            None => None,
-        }
-    }
-}
-
 /// Print DataFrame
 fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> {
     // Get string representation of record batches
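
On the consumer side, the `requested_schema` branch above is reached when a caller passes a schema through the capsule interface; after this change the projection happens once, over the already-collected batches, rather than per streamed batch. A hedged sketch under the same assumptions as above (illustrative table and column names, pyarrow >= 14; whether a given projection is accepted depends on `project_schema`):

import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2], "b": ["x", "y"]}, name="t2")  # illustrative

# Passing schema= hands a requested-schema capsule to __arrow_c_stream__,
# which routes through project_schema / record_batch_into_schema in the
# Rust code above.
requested = pa.schema([pa.field("a", pa.int64())])
reader = pa.RecordBatchReader.from_stream(df, schema=requested)
print(reader.schema)  # expected: only column "a"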
