
Commit 949da2a

Enhance documentation for __arrow_c_stream__ method and add tests for incremental streaming of DataFrame

1 parent e37796b

3 files changed: 58 additions & 6 deletions


python/datafusion/dataframe.py

Lines changed: 9 additions & 6 deletions
@@ -1098,19 +1098,22 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame
         return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Export an Arrow PyCapsule Stream.
+        """Export the DataFrame as an Arrow C Stream.
 
-        This will execute and collect the DataFrame. We will attempt to respect the
-        requested schema, but only trivial transformations will be applied such as only
-        returning the fields listed in the requested schema if their data types match
-        those in the DataFrame.
+        The DataFrame is executed using DataFusion's streaming APIs and exposed via
+        Arrow's C Stream interface. Record batches are produced incrementally, so the
+        full result set is never materialized in memory. When ``requested_schema`` is
+        provided, only straightforward projections such as column selection or
+        reordering are applied.
 
         Args:
             requested_schema: Attempt to provide the DataFrame using this schema.
 
         Returns:
-            Arrow PyCapsule object.
+            Arrow PyCapsule object representing an ``ArrowArrayStream``.
         """
+        # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
+        # ``execute_stream`` under the hood to stream batches one at a time.
         return self.df.__arrow_c_stream__(requested_schema)
 
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
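As a quick usage sketch (not part of this commit), the stream can be consumed from pyarrow, which added PyCapsule-protocol support in recent releases (15+); ``RecordBatchReader.from_stream`` calls ``__arrow_c_stream__`` under the hood, so batches arrive lazily:

import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.sql("SELECT 1 AS a")  # any DataFrame works here

# from_stream() invokes df.__arrow_c_stream__() and wraps the resulting
# ArrowArrayStream capsule; batches are pulled one at a time, not collected.
reader = pa.RecordBatchReader.from_stream(df)
for batch in reader:
    print(batch.num_rows)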

python/tests/test_dataframe.py

Lines changed: 23 additions & 0 deletions
@@ -1582,6 +1582,29 @@ def test_empty_to_arrow_table(df):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
+def test_arrow_c_stream_to_table(monkeypatch):
+    ctx = SessionContext()
+
+    # Create a DataFrame with two separate record batches
+    batch1 = pa.record_batch([pa.array([1])], names=["a"])
+    batch2 = pa.record_batch([pa.array([2])], names=["a"])
+    df = ctx.create_dataframe([[batch1], [batch2]])
+
+    # Fail if the DataFrame is pre-collected
+    def fail_collect(self):  # pragma: no cover - failure path
+        msg = "collect should not be called"
+        raise AssertionError(msg)
+
+    monkeypatch.setattr(DataFrame, "collect", fail_collect)
+
+    table = pa.Table.from_batches(df)
+    expected = pa.Table.from_batches([batch1, batch2])
+
+    assert table.equals(expected)
+    assert table.schema == df.schema()
+    assert table.column("a").num_chunks == 2
+
+
 def test_to_pylist(df):
     # Convert datafusion dataframe to Python list
     pylist = df.to_pylist()
python/tests/test_io.py

Lines changed: 26 additions & 0 deletions
@@ -14,6 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import resource
 from pathlib import Path
 
 import pyarrow as pa
@@ -92,3 +93,28 @@ def test_read_avro():
     path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
     avro_df = read_avro(path=path)
     assert avro_df is not None
+
+
+def test_arrow_c_stream_large_dataset(ctx):
+    """DataFrame.__arrow_c_stream__ yields batches incrementally.
+
+    This test constructs a DataFrame that would be far larger than available
+    memory if materialized. The ``__arrow_c_stream__`` method should expose a
+    stream of record batches without collecting the full dataset, so reading a
+    handful of batches should not exhaust process memory.
+    """
+    # Create a very large DataFrame using range; this would be terabytes if collected
+    df = ctx.range(0, 1 << 40)
+
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
+
+    # Track maximum RSS before consuming batches
+    start_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+
+    for _ in range(5):
+        batch = reader.read_next_batch()
+        assert batch is not None
+        assert len(batch) > 0
+    current_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    # Ensure memory usage hasn't grown substantially (< 50 MB; ru_maxrss is KiB on Linux)
+    assert current_max_rss - start_max_rss < 50 * 1024
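A portability note on this test (an observation, not a change in the commit): the ``resource`` module is Unix-only, and ``ru_maxrss`` is reported in KiB on Linux but in bytes on macOS, so the ``50 * 1024`` threshold assumes Linux. A hypothetical helper that normalizes the units might look like:

import sys
import resource  # Unix-only; not available on Windows

def max_rss_bytes() -> int:
    # ru_maxrss units differ by platform: KiB on Linux, bytes on macOS.
    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return rss if sys.platform == "darwin" else rss * 1024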
