2525from typing import (
2626 TYPE_CHECKING ,
2727 Any ,
28- AsyncIterator ,
2928 Iterable ,
30- Iterator ,
3129 Literal ,
3230 Optional ,
3331 Union ,
4442from datafusion ._internal import ParquetWriterOptions as ParquetWriterOptionsInternal
4543from datafusion .expr import Expr , SortExpr , sort_or_default
4644from datafusion .plan import ExecutionPlan , LogicalPlan
47- from datafusion .record_batch import RecordBatch , RecordBatchStream
45+ from datafusion .record_batch import RecordBatchStream
4846
4947if TYPE_CHECKING :
5048 import pathlib
@@ -298,9 +296,6 @@ def __init__(
298296class DataFrame :
299297 """Two dimensional table representation of data.
300298
301- DataFrame objects are iterable; iterating over a DataFrame yields
302- :class:`datafusion.RecordBatch` instances lazily.
303-
304299 See :ref:`user_guide_concepts` in the online documentation for more information.
305300 """
306301
@@ -317,7 +312,7 @@ def into_view(self) -> pa.Table:
317312 return self .df .into_view ()
318313
319314 def __getitem__ (self , key : str | list [str ]) -> DataFrame :
320- """Return a new :py:class: `DataFrame` with the specified column or columns.
315+ """Return a new :py:class`DataFrame` with the specified column or columns.
321316
322317 Args:
323318 key: Column name or list of column names to select.
@@ -1047,18 +1042,6 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]:
10471042 streams = self .df .execute_stream_partitioned ()
10481043 return [RecordBatchStream (rbs ) for rbs in streams ]
10491044
1050- @deprecated ("Use execute_stream() instead" )
1051- def to_record_batch_stream (self ) -> RecordBatchStream :
1052- """Return a :py:class:`RecordBatchStream` over this DataFrame's results.
1053-
1054- This method is deprecated. Use :py:meth:`execute_stream` instead.
1055-
1056- Returns:
1057- A ``RecordBatchStream`` representing the lazily generated record
1058- batches for this DataFrame.
1059- """
1060- return self .execute_stream ()
1061-
10621045 def to_pandas (self ) -> pd .DataFrame :
10631046 """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame.
10641047
@@ -1122,33 +1105,21 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
11221105 return DataFrame (self .df .unnest_columns (columns , preserve_nulls = preserve_nulls ))
11231106
11241107 def __arrow_c_stream__ (self , requested_schema : object | None = None ) -> object :
1125- """Export the DataFrame as an Arrow C Stream.
1108+ """Export an Arrow PyCapsule Stream.
11261109
1127- The DataFrame is executed using DataFusion's streaming APIs and exposed via
1128- Arrow's C Stream interface. Record batches are produced incrementally, so the
1129- full result set is never materialized in memory. When ``requested_schema`` is
1130- provided, only straightforward projections such as column selection or
1131- reordering are applied.
 1110+ This will execute and collect the DataFrame. We will attempt to respect the
 1111+ requested schema, but only trivial transformations will be applied, such as
 1112+ returning only the fields listed in the requested schema when their data
 1113+ types match those in the DataFrame.
11321114
11331115 Args:
11341116 requested_schema: Attempt to provide the DataFrame using this schema.
11351117
11361118 Returns:
1137- Arrow PyCapsule object representing an ``ArrowArrayStream`` .
1119+ Arrow PyCapsule object.
11381120 """
1139- # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
1140- # ``execute_stream_partitioned`` under the hood to stream batches while
1141- # preserving the original partition order.
11421121 return self .df .__arrow_c_stream__ (requested_schema )
11431122
1144- def __iter__ (self ) -> Iterator [RecordBatch ]:
1145- """Return an iterator over this DataFrame's record batches."""
1146- return iter (self .execute_stream ())
1147-
1148- def __aiter__ (self ) -> AsyncIterator [RecordBatch ]:
1149- """Return an async iterator over this DataFrame's record batches."""
1150- return self .execute_stream ().__aiter__ ()
1151-
11521123 def transform (self , func : Callable [..., DataFrame ], * args : Any ) -> DataFrame :
11531124 """Apply a function to the current DataFrame which returns another DataFrame.
11541125
0 commit comments