Skip to content

Commit 98e7e00

Browse files
committed
refactor: update DataFrame iteration to yield RecordBatch objects directly
1 parent 416bf8e commit 98e7e00

3 files changed

Lines changed: 18 additions & 13 deletions

File tree

docs/source/user-guide/dataframe/index.rst

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -131,13 +131,13 @@ Terminal Operations
131131
-------------------
132132

133133
To materialize the results of your DataFrame operations, call a terminal method or iterate over the
134-
``DataFrame`` to consume ``pyarrow.RecordBatch`` objects lazily:
134+
``DataFrame`` to consume :py:class:`datafusion.record_batch.RecordBatch` objects lazily:
135135

136136
.. code-block:: python
137137
138138
# Iterate over the DataFrame to stream record batches
139139
for batch in df:
140-
... # process each batch as it is produced
140+
... # batch is a datafusion.record_batch.RecordBatch
141141
142142
# Collect all data as PyArrow RecordBatches
143143
result_batches = df.collect()

python/datafusion/dataframe.py

Lines changed: 11 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -1133,23 +1133,24 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
11331133
# preserving the original partition order.
11341134
return self.df.__arrow_c_stream__(requested_schema)
11351135

1136-
def __iter__(self) -> Iterator[pa.RecordBatch]:
1137-
"""Yield record batches from this DataFrame lazily.
1136+
def __iter__(self) -> Iterator[RecordBatch]:
1137+
"""Yield :class:`datafusion.record_batch.RecordBatch` objects lazily.
11381138
1139-
This delegates to :py:meth:`to_stream` and converts each batch to a
1140-
:class:`pyarrow.RecordBatch` without eagerly materializing the entire
1141-
result set.
1139+
This delegates to :py:meth:`to_stream` without converting each batch to a
1140+
:class:`pyarrow.RecordBatch`. Use
1141+
:py:meth:`datafusion.record_batch.RecordBatch.to_pyarrow` when a
1142+
:class:`pyarrow.RecordBatch` is required.
11421143
"""
11431144
for batch in self.to_stream():
1144-
yield batch.to_pyarrow()
1145+
yield batch
11451146

1146-
def __aiter__(self) -> AsyncIterator[pa.RecordBatch]:
1147-
"""Asynchronously yield record batches from this DataFrame lazily."""
1147+
def __aiter__(self) -> AsyncIterator[RecordBatch]:
1148+
"""Asynchronously yield :class:`datafusion.record_batch.RecordBatch` objects lazily."""
11481149
stream = self.to_stream()
11491150

1150-
async def iterator() -> AsyncIterator[pa.RecordBatch]:
1151+
async def iterator() -> AsyncIterator[RecordBatch]:
11511152
async for batch in stream:
1152-
yield batch.to_pyarrow()
1153+
yield batch
11531154

11541155
return iterator()
11551156

python/tests/test_dataframe_iter_stream.py

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -16,6 +16,9 @@
1616
# under the License.
1717

1818

19+
from datafusion.record_batch import RecordBatch
20+
21+
1922
def test_to_stream(ctx):
2023
df = ctx.from_pydict({"a": [1, 2]})
2124
stream = df.to_stream()
@@ -28,4 +31,5 @@ def test_dataframe_iter(ctx):
2831
df = ctx.from_pydict({"a": [1, 2]})
2932
batches = list(df)
3033
assert len(batches) == 1
31-
assert batches[0].to_pydict() == {"a": [1, 2]}
34+
assert isinstance(batches[0], RecordBatch)
35+
assert batches[0].to_pyarrow().to_pydict() == {"a": [1, 2]}

0 commit comments

Comments (0)