|
43 | 43 | from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal |
44 | 44 | from datafusion.expr import Expr, SortExpr, sort_or_default |
45 | 45 | from datafusion.plan import ExecutionPlan, LogicalPlan |
46 | | -from datafusion.record_batch import RecordBatchStream |
| 46 | +from datafusion.record_batch import RecordBatch, RecordBatchStream |
47 | 47 |
|
48 | 48 | if TYPE_CHECKING: |
49 | 49 | import pathlib |
@@ -1030,6 +1030,10 @@ def execute_stream(self) -> RecordBatchStream: |
1030 | 1030 | """ |
1031 | 1031 | return RecordBatchStream(self.df.execute_stream()) |
1032 | 1032 |
|
| 1033 | + def to_record_batch_stream(self) -> RecordBatchStream: |
| 1034 | + """Return a :class:`RecordBatchStream` executing this DataFrame.""" |
| 1035 | + return self.execute_stream() |
| 1036 | + |
1033 | 1037 | def execute_stream_partitioned(self) -> list[RecordBatchStream]: |
1034 | 1038 | """Executes this DataFrame and returns a stream for each partition. |
1035 | 1039 |
|
@@ -1121,22 +1125,13 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: |
1121 | 1125 | # preserving the original partition order. |
1122 | 1126 | return self.df.__arrow_c_stream__(requested_schema) |
1123 | 1127 |
|
1124 | | - def __iter__(self) -> Iterator[pa.RecordBatch]: |
1125 | | - """Yield record batches from the DataFrame without materializing results. |
1126 | | -
|
1127 | | - This implementation streams record batches via the Arrow C Stream |
1128 | | - interface, allowing callers such as :func:`pyarrow.Table.from_batches` to |
1129 | | - consume results lazily. The DataFrame is executed using DataFusion's |
1130 | | - partitioned streaming APIs so ``collect`` is never invoked and batch |
1131 | | - order across partitions is preserved. |
1132 | | - """ |
1133 | | - from contextlib import closing |
1134 | | - |
1135 | | - import pyarrow as pa |
| 1128 | + def __iter__(self) -> Iterator[RecordBatch]: |
| 1129 | + """Yield :class:`RecordBatch` objects by streaming execution.""" |
| 1130 | + yield from self.to_record_batch_stream() |
1136 | 1131 |
|
1137 | | - reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__()) |
1138 | | - with closing(reader): |
1139 | | - yield from reader |
| 1132 | + async def __aiter__(self) -> RecordBatchStream: |
| 1133 | + """Return an asynchronous iterator over streamed ``RecordBatch`` objects.""" |
| 1134 | + return await self.to_record_batch_stream().__aiter__() |
1140 | 1135 |
|
1141 | 1136 | def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: |
1142 | 1137 | """Apply a function to the current DataFrame which returns another DataFrame. |
|
0 commit comments