Skip to content

Commit 3b19e8d

Browse files
committed
feat: add to_stream method for lazy processing of DataFrame results
1 parent 87dd275 commit 3b19e8d

3 files changed

Lines changed: 30 additions & 36 deletions

File tree

docs/source/user-guide/dataframe/index.rst

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,18 @@ out-of-memory errors.
     for batch in reader:
         ... # process each batch as it is produced
 
-DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
-lazily so you can loop over results directly:
+DataFrames expose :py:meth:`~datafusion.DataFrame.to_stream`, which returns a
+``RecordBatchStream`` for lazily processing results without materializing them
+all at once:
+
+.. code-block:: python
+
+    stream = df.to_stream()
+    for batch in stream:
+        ... # process each batch as it is produced
+
+DataFrames themselves are also iterable and delegate to ``to_stream()`` under
+the hood:
 
 .. code-block:: python
 

python/datafusion/dataframe.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,14 @@ def to_arrow_table(self) -> pa.Table:
         """
         return self.df.to_arrow_table()
 
+    def to_stream(self) -> RecordBatchStream:
+        """Execute this :py:class:`DataFrame` and return a record batch stream.
+
+        This is a convenience wrapper around :py:meth:`execute_stream` and can be
+        used to iterate over results without materializing them.
+        """
+        return self.execute_stream()
+
     def execute_stream(self) -> RecordBatchStream:
         """Executes this DataFrame and returns a stream over a single partition.
 

python/tests/test_dataframe_iter_stream.py

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -15,41 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import pyarrow as pa
-
-
-def test_iter_releases_reader(monkeypatch, ctx):
-    batches = [
-        pa.RecordBatch.from_pydict({"a": [1]}),
-        pa.RecordBatch.from_pydict({"a": [2]}),
-    ]
-
-    class DummyReader:
-        def __init__(self, batches):
-            self._iter = iter(batches)
-            self.closed = False
-
-        def __iter__(self):
-            return self
-
-        def __next__(self):
-            return next(self._iter)
-
-        def close(self):
-            self.closed = True
-
-    dummy_reader = DummyReader(batches)
-
-    class FakeRecordBatchReader:
-        @staticmethod
-        def _import_from_c_capsule(*_args, **_kwargs):
-            return dummy_reader
-
-    monkeypatch.setattr(pa, "RecordBatchReader", FakeRecordBatchReader)
 
+def test_to_stream(ctx):
     df = ctx.from_pydict({"a": [1, 2]})
+    stream = df.to_stream()
+    batches = [rb.to_pyarrow() for rb in stream]
+    assert len(batches) == 1
+    assert batches[0].to_pydict() == {"a": [1, 2]}
 
-    for _ in df:
-        break
 
-    assert dummy_reader.closed
+def test_dataframe_iter(ctx):
+    df = ctx.from_pydict({"a": [1, 2]})
+    batches = list(df)
+    assert len(batches) == 1
+    assert batches[0].to_pydict() == {"a": [1, 2]}

0 commit comments

Comments
 (0)