
Commit 4b57c7a (parent 4db9962)

feat: refactor DatasetExec to utilize ArrowArrayStreamReader and improve projection handling

1 file changed: src/dataset_exec.rs (69 additions, 169 deletions)
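The core of the change is visible in the imports below: instead of driving a pyarrow Dataset through Python method calls (scanner, get_fragments, to_batches), the new code pulls record batches across the Arrow C stream interface. A minimal sketch of that pattern, not taken from this commit, assuming the arrow crate's pyarrow feature (as the new imports do) plus pyo3; collect_batches is a hypothetical helper name:

    use arrow::ffi_stream::ArrowArrayStreamReader;
    use arrow::pyarrow::FromPyArrow;
    use arrow::record_batch::RecordBatch;
    use pyo3::prelude::*;

    // Hypothetical helper: drain every batch from a Python object that
    // implements __arrow_c_stream__ (e.g. a pyarrow RecordBatchReader or
    // Dataset). As I understand arrow-rs's pyarrow integration,
    // from_pyarrow_bound negotiates the stream capsule once; iteration then
    // goes through the C callbacks rather than Python attribute calls.
    fn collect_batches(obj: &Bound<'_, PyAny>) -> PyResult<Vec<RecordBatch>> {
        let reader = ArrowArrayStreamReader::from_pyarrow_bound(obj)?;
        reader
            .collect::<Result<Vec<_>, _>>()
            .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))
    }

Note the added `use arrow::array::RecordBatchReader;` in the diff: the `schema()` call on the FFI reader comes from that trait, so it must be in scope.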
@@ -15,138 +15,92 @@
 // specific language governing permissions and limitations
 // under the License.

-use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
-/// Implements a Datafusion physical ExecutionPlan that delegates to a PyArrow Dataset
-/// This actually performs the projection, filtering and scanning of a Dataset
-use pyo3::prelude::*;
-use pyo3::types::{PyDict, PyIterator, PyList};
-
-use std::any::Any;
-use std::sync::Arc;
-
-use futures::{stream, TryStreamExt};
-
-use datafusion::arrow::datatypes::SchemaRef;
-use datafusion::arrow::error::ArrowError;
-use datafusion::arrow::error::Result as ArrowResult;
-use datafusion::arrow::pyarrow::PyArrowType;
-use datafusion::arrow::record_batch::RecordBatch;
+use arrow::array::RecordBatchReader;
+use arrow::datatypes::SchemaRef;
+use arrow::error::{ArrowError, Result as ArrowResult};
+use arrow::ffi_stream::ArrowArrayStreamReader;
+use arrow::pyarrow::FromPyArrow;
+use arrow::record_batch::RecordBatch;
 use datafusion::error::{DataFusionError as InnerDataFusionError, Result as DFResult};
 use datafusion::execution::context::TaskContext;
-use datafusion::logical_expr::utils::conjunction;
-use datafusion::logical_expr::Expr;
 use datafusion::physical_expr::{EquivalenceProperties, LexOrdering};
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
 use datafusion::physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning,
     SendableRecordBatchStream, Statistics,
 };
+use futures::{stream, StreamExt};
+use pyo3::prelude::*;
+use std::any::Any;
+use std::sync::Arc;

 use crate::errors::PyDataFusionResult;
-use crate::pyarrow_filter_expression::PyArrowFilterExpression;

-struct PyArrowBatchesAdapter {
-    batches: Py<PyIterator>,
+/// Iterator over an ArrowArrayStreamReader with optional projection
+struct ArrowCStreamAdapter {
+    reader: ArrowArrayStreamReader,
+    projection: Option<Vec<usize>>,
 }

-impl Iterator for PyArrowBatchesAdapter {
+impl Iterator for ArrowCStreamAdapter {
     type Item = ArrowResult<RecordBatch>;

     fn next(&mut self) -> Option<Self::Item> {
-        Python::with_gil(|py| {
-            let mut batches = self.batches.clone_ref(py).into_bound(py);
-            Some(
-                batches
-                    .next()?
-                    .and_then(|batch| Ok(batch.extract::<PyArrowType<_>>()?.0))
-                    .map_err(|err| ArrowError::ExternalError(Box::new(err))),
-            )
+        self.reader.next().map(|batch_res| {
+            batch_res.and_then(|batch| {
+                if let Some(indices) = &self.projection {
+                    batch
+                        .project(indices)
+                        .map_err(|e| ArrowError::ExternalError(Box::new(e)))
+                } else {
+                    Ok(batch)
+                }
+            })
         })
     }
 }

-// Wraps a pyarrow.dataset.Dataset class and implements a Datafusion ExecutionPlan around it
+/// Execution plan that scans a Python object implementing ``__arrow_c_stream__``
 #[derive(Debug)]
 pub(crate) struct DatasetExec {
     dataset: PyObject,
     schema: SchemaRef,
-    fragments: Py<PyList>,
-    columns: Option<Vec<String>>,
-    filter_expr: Option<PyObject>,
+    projection: Option<Vec<usize>>,
     projected_statistics: Statistics,
     plan_properties: datafusion::physical_plan::PlanProperties,
 }

 impl DatasetExec {
     pub fn new(
-        py: Python,
         dataset: &Bound<'_, PyAny>,
         projection: Option<Vec<usize>>,
-        filters: &[Expr],
     ) -> PyDataFusionResult<Self> {
-        let columns: Option<PyDataFusionResult<Vec<String>>> = projection.map(|p| {
-            p.iter()
-                .map(|index| {
-                    let name: String = dataset
-                        .getattr("schema")?
-                        .call_method1("field", (*index,))?
-                        .getattr("name")?
-                        .extract()?;
-                    Ok(name)
-                })
-                .collect()
-        });
-        let columns: Option<Vec<String>> = columns.transpose()?;
-        let filter_expr: Option<PyObject> = conjunction(filters.to_owned())
-            .map(|filters| {
-                PyArrowFilterExpression::try_from(&filters)
-                    .map(|filter_expr| filter_expr.inner().clone_ref(py))
-            })
-            .transpose()?;
-
-        let kwargs = PyDict::new(py);
-
-        kwargs.set_item("columns", columns.clone())?;
-        kwargs.set_item(
-            "filter",
-            filter_expr.as_ref().map(|expr| expr.clone_ref(py)),
-        )?;
-
-        let scanner = dataset.call_method("scanner", (), Some(&kwargs))?;
-
-        let schema = Arc::new(
-            scanner
-                .getattr("projected_schema")?
-                .extract::<PyArrowType<_>>()?
-                .0,
-        );
-
-        let builtins = Python::import(py, "builtins")?;
-        let pylist = builtins.getattr("list")?;
-
-        // Get the fragments or partitions of the dataset
-        let fragments_iterator: Bound<'_, PyAny> = dataset.call_method1(
-            "get_fragments",
-            (filter_expr.as_ref().map(|expr| expr.clone_ref(py)),),
-        )?;
-
-        let fragments_iter = pylist.call1((fragments_iterator,))?;
-        let fragments = fragments_iter.downcast::<PyList>().map_err(PyErr::from)?;
+        let reader = ArrowArrayStreamReader::from_pyarrow_bound(dataset)?;
+        let base_schema = reader.schema().as_ref().clone();
+        drop(reader);
+
+        let projected_schema = if let Some(ref proj) = projection {
+            base_schema
+                .project(proj)
+                .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))?
+        } else {
+            base_schema
+        };
+        let schema: SchemaRef = Arc::new(projected_schema);

         let projected_statistics = Statistics::new_unknown(&schema);
         let plan_properties = datafusion::physical_plan::PlanProperties::new(
             EquivalenceProperties::new(schema.clone()),
-            Partitioning::UnknownPartitioning(fragments.len()),
+            Partitioning::UnknownPartitioning(1),
             EmissionType::Final,
             Boundedness::Bounded,
         );

         Ok(DatasetExec {
             dataset: dataset.clone().unbind(),
             schema,
-            fragments: fragments.clone().unbind(),
-            columns,
-            filter_expr,
+            projection,
             projected_statistics,
             plan_properties,
         })
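One design note on the new constructor: it opens the stream once just to learn the producer's schema, drops that reader, and derives the declared schema with Schema::project. The per-batch adapter above applies RecordBatch::project with the same indices, so what the plan advertises matches what it emits. A standalone sketch of that invariant (projection_demo is a hypothetical function, not part of the commit):

    use std::sync::Arc;

    use arrow::array::{ArrayRef, Int32Array};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;

    // Hypothetical demo: the same index list drives both projections, so the
    // schema the plan declares and the batches the adapter emits line up.
    fn projection_demo() -> arrow::error::Result<()> {
        let schema = Schema::new(vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Int32, false),
        ]);
        let batch = RecordBatch::try_new(
            Arc::new(schema.clone()),
            vec![
                Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
                Arc::new(Int32Array::from(vec![4, 5, 6])) as ArrayRef,
            ],
        )?;

        let projection = [1usize]; // keep only column "b"
        let projected_schema = schema.project(&projection)?;
        let projected_batch = batch.project(&projection)?;
        assert_eq!(projected_batch.schema().as_ref(), &projected_schema);
        Ok(())
    }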
@@ -155,22 +109,18 @@ impl DatasetExec {

 impl ExecutionPlan for DatasetExec {
     fn name(&self) -> &str {
-        // [ExecutionPlan::name] docs recommends forwarding to `static_name`
         Self::static_name()
     }

-    /// Return a reference to Any that can be used for downcasting
     fn as_any(&self) -> &dyn Any {
         self
     }

-    /// Get the schema for this execution plan
     fn schema(&self) -> SchemaRef {
         self.schema.clone()
     }

     fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        // this is a leaf node and has no children
         vec![]
     }

@@ -184,56 +134,22 @@ impl ExecutionPlan for DatasetExec {
     fn execute(
         &self,
         partition: usize,
-        context: Arc<TaskContext>,
+        _context: Arc<TaskContext>,
     ) -> DFResult<SendableRecordBatchStream> {
-        let batch_size = context.session_config().batch_size();
+        if partition != 0 {
+            return Err(InnerDataFusionError::Plan("invalid partition".to_string()));
+        }
         Python::with_gil(|py| {
             let dataset = self.dataset.bind(py);
-            let fragments = self.fragments.bind(py);
-            let fragment = fragments
-                .get_item(partition)
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-
-            // We need to pass the dataset schema to unify the fragment and dataset schema per PyArrow docs
-            let dataset_schema = dataset
-                .getattr("schema")
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-            let kwargs = PyDict::new(py);
-            kwargs
-                .set_item("columns", self.columns.clone())
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-            kwargs
-                .set_item(
-                    "filter",
-                    self.filter_expr.as_ref().map(|expr| expr.clone_ref(py)),
-                )
+            let reader = ArrowArrayStreamReader::from_pyarrow_bound(dataset)
                 .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-            kwargs
-                .set_item("batch_size", batch_size)
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-            let scanner = fragment
-                .call_method("scanner", (dataset_schema,), Some(&kwargs))
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-            let schema: SchemaRef = Arc::new(
-                scanner
-                    .getattr("projected_schema")
-                    .and_then(|schema| Ok(schema.extract::<PyArrowType<_>>()?.0))
-                    .map_err(|err| InnerDataFusionError::External(Box::new(err)))?,
-            );
-            let record_batches: Bound<'_, PyIterator> = scanner
-                .call_method0("to_batches")
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?
-                .try_iter()
-                .map_err(|err| InnerDataFusionError::External(Box::new(err)))?;
-
-            let record_batches = PyArrowBatchesAdapter {
-                batches: record_batches.into(),
+            let adapter = ArrowCStreamAdapter {
+                reader,
+                projection: self.projection.clone(),
             };
-
-            let record_batch_stream = stream::iter(record_batches);
-            let record_batch_stream: SendableRecordBatchStream = Box::pin(
-                RecordBatchStreamAdapter::new(schema, record_batch_stream.map_err(|e| e.into())),
-            );
+            let stream = stream::iter(adapter).map(|r| r.map_err(|e| e.into()));
+            let record_batch_stream: SendableRecordBatchStream =
+                Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), stream));
             Ok(record_batch_stream)
         })
     }
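Because the plan now declares UnknownPartitioning(1), execute rejects any partition other than 0. The body above also shows the standard bridge from a blocking Iterator to the async SendableRecordBatchStream DataFusion expects: futures' stream::iter wraps the iterator, and RecordBatchStreamAdapter pins the declared schema to it. A self-contained sketch of the same bridge (iterator_to_stream is a hypothetical helper name):

    use datafusion::arrow::datatypes::SchemaRef;
    use datafusion::arrow::error::Result as ArrowResult;
    use datafusion::arrow::record_batch::RecordBatch;
    use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
    use datafusion::physical_plan::SendableRecordBatchStream;
    use futures::{stream, StreamExt};

    // Hypothetical helper showing the iterator-to-stream bridge used above:
    // stream::iter yields one batch per poll, map_err converts ArrowError
    // into DataFusionError, and the adapter carries the schema so DataFusion
    // can validate the stream's output type.
    fn iterator_to_stream(
        schema: SchemaRef,
        batches: impl Iterator<Item = ArrowResult<RecordBatch>> + Send + 'static,
    ) -> SendableRecordBatchStream {
        let stream = stream::iter(batches).map(|r| r.map_err(Into::into));
        Box::pin(RecordBatchStreamAdapter::new(schema, stream))
    }

Since the iterator is drained as the stream is polled, the synchronous FFI reader stays hidden behind DataFusion's async interface without any extra task or channel.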
@@ -248,7 +164,6 @@ impl ExecutionPlan for DatasetExec {
 }

 impl ExecutionPlanProperties for DatasetExec {
-    /// Get the output partitioning of this plan
     fn output_partitioning(&self) -> &Partitioning {
         self.plan_properties.output_partitioning()
     }
@@ -272,37 +187,22 @@ impl ExecutionPlanProperties for DatasetExec {

 impl DisplayAs for DatasetExec {
     fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        Python::with_gil(|py| {
-            let number_of_fragments = self.fragments.bind(py).len();
-            match t {
-                DisplayFormatType::Default
-                | DisplayFormatType::Verbose
-                | DisplayFormatType::TreeRender => {
-                    let projected_columns: Vec<String> = self
-                        .schema
-                        .fields()
-                        .iter()
-                        .map(|x| x.name().to_owned())
-                        .collect();
-                    if let Some(filter_expr) = &self.filter_expr {
-                        let filter_expr = filter_expr.bind(py).str().or(Err(std::fmt::Error))?;
-                        write!(
-                            f,
-                            "DatasetExec: number_of_fragments={}, filter_expr={}, projection=[{}]",
-                            number_of_fragments,
-                            filter_expr,
-                            projected_columns.join(", "),
-                        )
-                    } else {
-                        write!(
-                            f,
-                            "DatasetExec: number_of_fragments={}, projection=[{}]",
-                            number_of_fragments,
-                            projected_columns.join(", "),
-                        )
-                    }
-                }
+        match t {
+            DisplayFormatType::Default
+            | DisplayFormatType::Verbose
+            | DisplayFormatType::TreeRender => {
+                let projected_columns: Vec<String> = self
+                    .schema
+                    .fields()
+                    .iter()
+                    .map(|x| x.name().to_owned())
+                    .collect();
+                write!(
+                    f,
+                    "DatasetExec: projection=[{}]",
+                    projected_columns.join(", ")
+                )
             }
-        })
+        }
     }
 }
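Design note: with fragments, columns, and filter_expr gone from the struct, fmt_as no longer needs Python::with_gil, and the plan renders from the projected schema alone; a plan keeping columns a and b now prints as DatasetExec: projection=[a, b]. The trade-off visible throughout the diff is that filter pushdown and per-fragment partitioning are no longer expressed by this operator: the C stream is consumed as a single partition.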
