Commit c5cd149

Enhance Arrow C stream export to support partitioned reading, reducing memory usage for large result sets

1 parent d05a410

4 files changed: 81 additions & 31 deletions

docs/source/user-guide/io/arrow.rst

Lines changed: 3 additions & 0 deletions

@@ -79,6 +79,9 @@ output incrementally:
     for batch in reader:
         ...  # process each batch without buffering the entire table
 
+DataFusion reads one partition at a time when exporting a C stream, so large
+result sets are not buffered entirely in memory.
+
 If the goal is simply to persist results, prefer engine-level writers such as
 ``df.write_parquet()``. These writers stream data from Rust directly to the
 destination and avoid Python-side memory growth.
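For reference, a minimal consumer of this export could look like the sketch below. It assumes pyarrow >= 14, where pa.RecordBatchReader.from_stream accepts any object implementing __arrow_c_stream__, and a hypothetical table name large_table registered on the context:

    import pyarrow as pa

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.sql("SELECT * FROM large_table")  # hypothetical registered table

    # from_stream drives __arrow_c_stream__ under the hood; with partitioned
    # export, batches arrive one partition at a time instead of all at once.
    reader = pa.RecordBatchReader.from_stream(df)
    for batch in reader:
        ...  # process each pyarrow.RecordBatch incrementally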

python/datafusion/dataframe.py

Lines changed: 6 additions & 4 deletions

@@ -1100,10 +1100,12 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         """Export an Arrow PyCapsule Stream.
 
-        This will execute and collect the DataFrame. We will attempt to respect the
-        requested schema, but only trivial transformations will be applied such as only
-        returning the fields listed in the requested schema if their data types match
-        those in the DataFrame.
+        This executes the query lazily and returns a capsule backed by a
+        partition-aware reader. It will respect the requested schema when
+        possible, but only trivial transformations are applied such as returning
+        only the fields listed in the requested schema if their data types match
+        those in the DataFrame. Batches are yielded one partition at a time so
+        results are not buffered entirely in memory.
 
         Args:
             requested_schema: Attempt to provide the DataFrame using this schema.
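To illustrate the projection behavior the docstring describes, here is a hedged sketch: pyarrow forwards the schema argument of pa.RecordBatchReader.from_stream to __arrow_c_stream__ as requested_schema, so selecting a subset of fields whose types match is the kind of trivial transformation that should succeed. The DataFrame df with int64 columns a and b is an assumption:

    import pyarrow as pa

    # Assumed: df exposes columns "a" (int64) and "b" (int64).
    requested = pa.schema([pa.field("a", pa.int64())])

    # The schema is passed through to __arrow_c_stream__ as requested_schema;
    # since "a" exists with a matching type, batches contain only that field.
    reader = pa.RecordBatchReader.from_stream(df, schema=requested)
    for batch in reader:
        assert batch.schema.names == ["a"]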

python/tests/test_record_batch_stream.py

Lines changed: 32 additions & 0 deletions

@@ -33,3 +33,35 @@ async def test_record_batch_stream_anext(ctx):
     assert batch.to_pyarrow().num_rows == 1
     with pytest.raises(StopAsyncIteration):
         await stream.__anext__()
+
+
+def test_arrow_c_stream_partitioned(tmp_path, ctx):
+    import gc
+
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    num_parts = 5
+    rows_per_part = 100_000
+    arr = pa.array(range(rows_per_part), pa.int64())
+    batch = pa.RecordBatch.from_arrays([arr], names=["a"])
+    table = pa.Table.from_batches([batch])
+    for i in range(num_parts):
+        pq.write_table(table, tmp_path / f"part{i}.parquet")
+
+    del arr, batch, table
+    gc.collect()
+
+    df = ctx.read_parquet(str(tmp_path))
+    capsule = df.__arrow_c_stream__()
+    reader = pa.RecordBatchReader._import_from_c_capsule(capsule)
+
+    pool = pa.default_memory_pool()
+    baseline = pool.bytes_allocated()
+    peak = baseline
+    for b in reader:
+        peak = max(peak, pool.bytes_allocated())
+        del b
+        gc.collect()
+
+    assert peak - baseline < rows_per_part * 8 * 2
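The final bound checks the memory claim directly: each partition holds 100_000 int64 values, roughly 800 kB, so the full five-partition result is about 4 MB, while the assertion requires peak allocation growth to stay under rows_per_part * 8 * 2 = 1.6 MB, i.e. under two partitions' worth of data.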

src/dataframe.rs

Lines changed: 40 additions & 27 deletions

@@ -943,12 +943,12 @@ impl PyDataFrame {
         py: Python<'py>,
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
-        // execute query lazily using a stream
+        // execute query lazily using a stream per partition
         let df = self.df.as_ref().clone();
-        let stream = spawn_and_wait(py, async move { df.execute_stream().await })?;
+        let streams = spawn_and_wait(py, async move { df.execute_stream_partitioned().await })?;
 
         // Determine the schema and handle optional projection
-        let stream_schema = stream.schema();
+        let stream_schema = streams[0].schema();
         let mut schema: Schema = stream_schema.as_ref().to_owned();
         let mut project = false;
 
@@ -963,8 +963,9 @@ impl PyDataFrame {
         }
 
         let schema_ref: SchemaRef = Arc::new(schema);
-        let reader: Box<dyn RecordBatchReader + Send> =
-            Box::new(ArrowStreamReader::new(stream, schema_ref, project));
+        let reader: Box<dyn RecordBatchReader + Send> = Box::new(
+            PartitionedArrowStreamReader::new(streams, schema_ref, project),
+        );
 
         let ffi_stream = FFI_ArrowArrayStream::new(reader);
         let stream_capsule_name = CString::new("arrow_array_stream").unwrap();
@@ -1049,48 +1050,60 @@ impl PyDataFrame {
     }
 }
 
-struct ArrowStreamReader {
-    stream: SendableRecordBatchStream,
+struct PartitionedArrowStreamReader {
+    streams: Vec<SendableRecordBatchStream>,
     schema: SchemaRef,
     project: bool,
+    current: usize,
 }
 
-impl ArrowStreamReader {
-    fn new(stream: SendableRecordBatchStream, schema: SchemaRef, project: bool) -> Self {
+impl PartitionedArrowStreamReader {
+    fn new(streams: Vec<SendableRecordBatchStream>, schema: SchemaRef, project: bool) -> Self {
         Self {
-            stream,
+            streams,
             schema,
             project,
+            current: 0,
         }
     }
 }
 
-impl RecordBatchReader for ArrowStreamReader {
+impl RecordBatchReader for PartitionedArrowStreamReader {
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }
 }
 
-impl Iterator for ArrowStreamReader {
+impl Iterator for PartitionedArrowStreamReader {
     type Item = Result<RecordBatch, ArrowError>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let result = Python::with_gil(|py| wait_for_stream_next(py, &mut self.stream));
-
-        match result {
-            Ok(Some(batch)) => {
-                let batch = if self.project {
-                    match record_batch_into_schema(batch, self.schema.as_ref()) {
-                        Ok(b) => b,
-                        Err(e) => return Some(Err(e)),
-                    }
-                } else {
-                    batch
-                };
-                Some(Ok(batch))
+        loop {
+            if self.current >= self.streams.len() {
+                return None;
+            }
+
+            let stream = &mut self.streams[self.current];
+            let result = Python::with_gil(|py| wait_for_stream_next(py, stream));
+
+            match result {
+                Ok(Some(batch)) => {
+                    let batch = if self.project {
+                        match record_batch_into_schema(batch, self.schema.as_ref()) {
+                            Ok(b) => b,
+                            Err(e) => return Some(Err(e)),
+                        }
+                    } else {
+                        batch
+                    };
+                    return Some(Ok(batch));
+                }
+                Ok(None) => {
+                    self.current += 1;
+                    continue;
+                }
+                Err(e) => return Some(Err(ArrowError::from(e))),
             }
-            Ok(None) => None,
-            Err(e) => Some(Err(ArrowError::from(e))),
         }
     }
 }
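The reader drains partitions strictly in order: each next() call polls the current partition's stream under the GIL via wait_for_stream_next, and an exhausted partition (Ok(None)) advances current and loops to the next one rather than ending the stream. Since all partitions of a DataFusion plan share the plan's schema, taking it from streams[0] is sufficient.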
