
Commit d6f5c86

refactor: enhance DataFrame and RecordBatchStream iteration support
1 parent 80bc96a commit d6f5c86

3 files changed: 54 additions & 19 deletions


docs/source/user-guide/dataframe/index.rst

Lines changed: 21 additions & 2 deletions
@@ -168,14 +168,33 @@ out-of-memory errors.
     for batch in reader:
         ... # process each batch as it is produced
 
-DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
-lazily so you can loop over results directly:
+DataFrames are also iterable, yielding :class:`datafusion.RecordBatch`
+objects lazily so you can loop over results directly without importing
+PyArrow:
 
 .. code-block:: python
 
     for batch in df:
+        ... # each batch is a ``RecordBatch``
+
+Asynchronous iteration is supported as well, allowing integration with
+``asyncio`` event loops:
+
+.. code-block:: python
+
+    async for batch in df:
         ... # process each batch as it is produced
 
+To work with the stream directly, use
+``to_record_batch_stream()``, which returns a
+:class:`~datafusion.RecordBatchStream`:
+
+.. code-block:: python
+
+    stream = df.to_record_batch_stream()
+    for batch in stream:
+        ...
+
 See :doc:`../io/arrow` for additional details on the Arrow interface.
 
 HTML Rendering
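
Taken together, the documented patterns look like this in practice. The following is a minimal sketch, not part of the commit: it assumes a SessionContext and an illustrative query, and mirrors the three iteration styles the updated docs describe.

    import asyncio

    from datafusion import SessionContext

    ctx = SessionContext()

    # Synchronous iteration: batches stream lazily, nothing is collected up front.
    for batch in ctx.sql("SELECT 1 AS a"):
        ...  # each batch is a datafusion.RecordBatch

    # Asynchronous iteration inside an asyncio event loop, as the docs above show.
    async def consume() -> None:
        async for batch in ctx.sql("SELECT 1 AS a"):
            ...

    asyncio.run(consume())

    # Working with the stream object directly.
    stream = ctx.sql("SELECT 1 AS a").to_record_batch_stream()
    for batch in stream:
        ...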

python/datafusion/dataframe.py

Lines changed: 11 additions & 16 deletions
@@ -43,7 +43,7 @@
 from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
 from datafusion.expr import Expr, SortExpr, sort_or_default
 from datafusion.plan import ExecutionPlan, LogicalPlan
-from datafusion.record_batch import RecordBatchStream
+from datafusion.record_batch import RecordBatch, RecordBatchStream
 
 if TYPE_CHECKING:
     import pathlib
@@ -1030,6 +1030,10 @@ def execute_stream(self) -> RecordBatchStream:
         """
         return RecordBatchStream(self.df.execute_stream())
 
+    def to_record_batch_stream(self) -> RecordBatchStream:
+        """Return a :class:`RecordBatchStream` executing this DataFrame."""
+        return self.execute_stream()
+
     def execute_stream_partitioned(self) -> list[RecordBatchStream]:
         """Executes this DataFrame and returns a stream for each partition.
 
@@ -1121,22 +1125,13 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
-    def __iter__(self) -> Iterator[pa.RecordBatch]:
-        """Yield record batches from the DataFrame without materializing results.
-
-        This implementation streams record batches via the Arrow C Stream
-        interface, allowing callers such as :func:`pyarrow.Table.from_batches` to
-        consume results lazily. The DataFrame is executed using DataFusion's
-        partitioned streaming APIs so ``collect`` is never invoked and batch
-        order across partitions is preserved.
-        """
-        from contextlib import closing
-
-        import pyarrow as pa
+    def __iter__(self) -> Iterator[RecordBatch]:
+        """Yield :class:`RecordBatch` objects by streaming execution."""
+        yield from self.to_record_batch_stream()
 
-        reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__())
-        with closing(reader):
-            yield from reader
+    async def __aiter__(self) -> RecordBatchStream:
+        """Return an asynchronous iterator over streamed ``RecordBatch`` objects."""
+        return await self.to_record_batch_stream().__aiter__()
 
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.

python/datafusion/record_batch.py

Lines changed: 22 additions & 1 deletion
@@ -46,6 +46,26 @@ def to_pyarrow(self) -> pa.RecordBatch:
         """Convert to :py:class:`pa.RecordBatch`."""
         return self.record_batch.to_pyarrow()
 
+    def __arrow_c_array__(
+        self, requested_schema: object | None = None
+    ) -> tuple[object, object]:
+        """Export the record batch via the Arrow C Data Interface.
+
+        This allows zero-copy interchange with libraries that support the
+        `Arrow PyCapsule interface <https://arrow.apache.org/docs/format/
+        CDataInterface/PyCapsuleInterface.html>`_.
+
+        Args:
+            requested_schema: Attempt to provide the record batch using this
+                schema. Only straightforward projections such as column
+                selection or reordering are applied.
+
+        Returns:
+            Two Arrow PyCapsule objects representing the ``ArrowArray`` and
+            ``ArrowSchema``.
+        """
+        return self.record_batch.__arrow_c_array__(requested_schema)
+
 
 class RecordBatchStream:
     """This class represents a stream of record batches.
@@ -72,8 +92,9 @@ def __next__(self) -> RecordBatch:
         next_batch = next(self.rbs)
         return RecordBatch(next_batch)
 
-    def __aiter__(self) -> typing_extensions.Self:
+    async def __aiter__(self) -> typing_extensions.Self:
         """Async iterator function."""
+        await self.rbs.__aiter__()
         return self
 
     def __iter__(self) -> typing_extensions.Self:
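
With __arrow_c_array__ in place, a datafusion RecordBatch can cross into any PyCapsule-aware library without a copy. A minimal sketch, assuming a recent PyArrow (14.0+, where pa.record_batch() accepts any object implementing the PyCapsule interface); the query is illustrative:

    import pyarrow as pa

    from datafusion import SessionContext

    ctx = SessionContext()
    stream = ctx.sql("SELECT 1 AS a").to_record_batch_stream()

    for batch in stream:
        # pa.record_batch() detects the __arrow_c_array__ capsule pair and
        # imports the batch zero-copy instead of round-tripping through Python.
        arrow_batch = pa.record_batch(batch)
        print(arrow_batch.num_rows)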
