Commit 34a7e51

feat: enhance RecordBatch and DataFrame methods for improved PyArrow compatibility
1 parent 1171d44 · commit 34a7e51

3 files changed: 74 additions & 37 deletions

python/datafusion/dataframe.py

Lines changed: 34 additions & 22 deletions
@@ -332,8 +332,8 @@ def _repr_html_(self) -> str:
 
     @staticmethod
     def default_str_repr(
-        batches: list[pa.RecordBatch],
-        schema: pa.Schema,
+        batches: list[RecordBatch],
+        schema: "pa.Schema",
         has_more: bool,
         table_uuid: str | None = None,
     ) -> str:
@@ -342,7 +342,13 @@ def default_str_repr(
         This method is used by the default formatter and implemented in Rust for
         performance reasons.
         """
-        return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)
+        import pyarrow as pa
+
+        py_batches = [b.to_pyarrow() for b in batches]
+        schema = pa.schema(schema)
+        return DataFrameInternal.default_str_repr(
+            py_batches, schema, has_more, table_uuid
+        )
 
     def describe(self) -> DataFrame:
         """Return the statistics for this DataFrame.
@@ -589,17 +595,17 @@ def tail(self, n: int = 5) -> DataFrame:
         """
         return DataFrame(self.df.limit(n, max(0, self.count() - n)))
 
-    def collect(self) -> list[pa.RecordBatch]:
+    def collect(self) -> list[RecordBatch]:
         """Execute this :py:class:`DataFrame` and collect results into memory.
 
-        Prior to calling ``collect``, modifying a DataFrme simply updates a plan
+        Prior to calling ``collect``, modifying a DataFrame simply updates a plan
         (no actual computation is performed). Calling ``collect`` triggers the
         computation.
 
         Returns:
-            List of :py:class:`pyarrow.RecordBatch` collected from the DataFrame.
+            List of :py:class:`RecordBatch` collected from the DataFrame.
         """
-        return self.df.collect()
+        return [RecordBatch(rb) for rb in self.df.collect()]
 
     def cache(self) -> DataFrame:
         """Cache the DataFrame as a memory table.
@@ -609,17 +615,19 @@ def cache(self) -> DataFrame:
         """
         return DataFrame(self.df.cache())
 
-    def collect_partitioned(self) -> list[list[pa.RecordBatch]]:
+    def collect_partitioned(self) -> list[list[RecordBatch]]:
         """Execute this DataFrame and collect all partitioned results.
 
-        This operation returns :py:class:`pyarrow.RecordBatch` maintaining the input
+        This operation returns :py:class:`RecordBatch` maintaining the input
         partitioning.
 
         Returns:
             List of list of :py:class:`RecordBatch` collected from the
             DataFrame.
         """
-        return self.df.collect_partitioned()
+        return [
+            [RecordBatch(rb) for rb in rbs] for rbs in self.df.collect_partitioned()
+        ]
 
     def show(self, num: int = 20) -> None:
         """Execute the DataFrame and print the result to the console.
@@ -1047,13 +1055,15 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]:
         streams = self.df.execute_stream_partitioned()
         return [RecordBatchStream(rbs) for rbs in streams]
 
-    def to_pandas(self) -> pd.DataFrame:
-        """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame.
+    def to_pandas(self) -> "pd.DataFrame":
+        """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame."""
 
-        Returns:
-            Pandas DataFrame.
-        """
-        return self.df.to_pandas()
+        import pandas as pd
+        import pyarrow as pa
+
+        batches = [rb.to_pyarrow() for rb in self.collect()]
+        table = pa.Table.from_batches(batches)
+        return table.to_pandas()
 
     def to_pylist(self) -> list[dict[str, Any]]:
         """Execute the :py:class:`DataFrame` and convert it into a list of dictionaries.
@@ -1071,13 +1081,15 @@ def to_pydict(self) -> dict[str, list[Any]]:
         """
         return self.df.to_pydict()
 
-    def to_polars(self) -> pl.DataFrame:
-        """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame.
+    def to_polars(self) -> "pl.DataFrame":
+        """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame."""
 
-        Returns:
-            Polars DataFrame.
-        """
-        return self.df.to_polars()
+        import polars as pl
+        import pyarrow as pa
+
+        batches = [rb.to_pyarrow() for rb in self.collect()]
+        table = pa.Table.from_batches(batches)
+        return pl.from_arrow(table)
 
     def count(self) -> int:
         """Return the total number of rows in this :py:class:`DataFrame`.

python/datafusion/record_batch.py

Lines changed: 31 additions & 4 deletions
@@ -23,7 +23,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     import pyarrow as pa
@@ -33,25 +33,52 @@
 
 
 class RecordBatch:
-    """This class is essentially a wrapper for :py:class:`pa.RecordBatch`."""
+    """Wrapper around project-defined ``RecordBatch`` with optional PyArrow support."""
 
     def __init__(self, record_batch: df_internal.RecordBatch) -> None:
         """This constructor is generally not called by the end user.
 
         See the :py:class:`RecordBatchStream` iterator for generating this class.
         """
         self.record_batch = record_batch
+        self._pyarrow_rb: pa.RecordBatch | None = None
 
     def to_pyarrow(self) -> pa.RecordBatch:
-        """Convert to :py:class:`pa.RecordBatch`."""
-        return self.record_batch.to_pyarrow()
+        """Convert to :py:class:`pa.RecordBatch`.
+
+        Requires :mod:`pyarrow` to be installed.
+        """
+        if self._pyarrow_rb is None:
+            self._pyarrow_rb = self.record_batch.to_pyarrow()
+        return self._pyarrow_rb
 
     def __arrow_c_array__(
         self, requested_schema: object | None = None
     ) -> tuple[object, object]:
         """Arrow C Data Interface export."""
         return self.record_batch.__arrow_c_array__(requested_schema)
 
+    # ------------------------------------------------------------------
+    # PyArrow compatibility helpers
+    # ------------------------------------------------------------------
+    def __getattr__(self, item: str) -> Any:  # pragma: no cover - simple delegation
+        """Delegate attribute access to the PyArrow ``RecordBatch``."""
+        return getattr(self.to_pyarrow(), item)
+
+    def __getitem__(self, key) -> Any:  # pragma: no cover - simple delegation
+        """Delegate item access to the PyArrow ``RecordBatch``."""
+        return self.to_pyarrow()[key]
+
+    def __len__(self) -> int:  # pragma: no cover - simple delegation
+        """Delegate ``len`` to the PyArrow ``RecordBatch``."""
+        return len(self.to_pyarrow())
+
+    def __eq__(self, other: object) -> bool:  # pragma: no cover - simple delegation
+        """Compare equality using the underlying PyArrow ``RecordBatch``."""
+        if not isinstance(other, RecordBatch):
+            return NotImplemented
+        return self.to_pyarrow().equals(other.to_pyarrow())
+
 
 class RecordBatchStream:
     """This class represents a stream of record batches.

src/dataframe.rs

Lines changed: 9 additions & 11 deletions
@@ -47,7 +47,7 @@ use crate::catalog::PyTable;
 use crate::errors::{py_datafusion_err, PyDataFusionError};
 use crate::expr::sort_expr::to_sort_expressions;
 use crate::physical_plan::PyExecutionPlan;
-use crate::record_batch::{poll_next_batch, PyRecordBatchStream};
+use crate::record_batch::{poll_next_batch, PyRecordBatch, PyRecordBatchStream};
 use crate::sql::logical::PyLogicalPlan;
 use crate::utils::{
     get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, spawn_future, validate_pycapsule,
@@ -582,12 +582,10 @@ impl PyDataFrame {
     /// Executes the plan, returning a list of `RecordBatch`es.
     /// Unless some order is specified in the plan, there is no
     /// guarantee of the order of the result.
-    fn collect(&self, py: Python) -> PyResult<Vec<PyObject>> {
+    fn collect(&self, py: Python) -> PyDataFusionResult<Vec<PyRecordBatch>> {
         let batches = wait_for_future(py, self.df.as_ref().clone().collect())?
             .map_err(PyDataFusionError::from)?;
-        // cannot use PyResult<Vec<RecordBatch>> return type due to
-        // https://github.com/PyO3/pyo3/issues/1813
-        batches.into_iter().map(|rb| rb.to_pyarrow(py)).collect()
+        Ok(batches.into_iter().map(PyRecordBatch::from).collect())
     }
 
     /// Cache DataFrame.
@@ -596,16 +594,16 @@ impl PyDataFrame {
         Ok(Self::new(df))
     }
 
-    /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
-    /// maintaining the input partitioning.
-    fn collect_partitioned(&self, py: Python) -> PyResult<Vec<Vec<PyObject>>> {
+    /// Executes this DataFrame and collects all results into a vector of vector of
+    /// `RecordBatch`, maintaining the input partitioning.
+    fn collect_partitioned(&self, py: Python) -> PyDataFusionResult<Vec<Vec<PyRecordBatch>>> {
         let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned())?
             .map_err(PyDataFusionError::from)?;
 
-        batches
+        Ok(batches
             .into_iter()
-            .map(|rbs| rbs.into_iter().map(|rb| rb.to_pyarrow(py)).collect())
-            .collect()
+            .map(|rbs| rbs.into_iter().map(PyRecordBatch::from).collect())
+            .collect())
     }
 
     /// Print the result, 20 lines by default
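On the Rust side, collect and collect_partitioned now return PyRecordBatch values directly via From<RecordBatch> (the old workaround comment for PyO3/pyo3#1813 is gone) instead of eagerly converting every batch with to_pyarrow(py), and both switch to the crate's PyDataFusionResult error type. From Python, the observable shape is (a sketch, reusing df from the earlier examples):

    parts = df.collect_partitioned()  # list[list[RecordBatch]], input partitioning kept
    total_rows = sum(len(batch) for part in parts for batch in part)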
