
Commit 760969e

Add tests for KeyboardInterrupt handling in __arrow_c_stream__ and improve async stream signal handling
1 parent: 77a810e

2 files changed: 117 additions & 8 deletions


python/tests/test_dataframe.py

Lines changed: 104 additions & 0 deletions
@@ -2689,6 +2689,110 @@ def trigger_interrupt():
     interrupt_thread.join(timeout=1.0)


+def test_arrow_c_stream_interrupted():
+    """__arrow_c_stream__ responds to ``KeyboardInterrupt`` signals.
+
+    Similar to ``test_collect_interrupted`` this test issues a long running
+    query, but consumes the results via ``__arrow_c_stream__``. It then raises
+    ``KeyboardInterrupt`` in the main thread and verifies that the stream
+    iteration stops promptly with the appropriate exception.
+    """
+
+    ctx = SessionContext()
+
+    batches = []
+    for i in range(10):
+        batch = pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(i * 1000, (i + 1) * 1000))),
+                pa.array([f"value_{j}" for j in range(i * 1000, (i + 1) * 1000)]),
+            ],
+            names=["a", "b"],
+        )
+        batches.append(batch)
+
+    ctx.register_record_batches("t1", [batches])
+    ctx.register_record_batches("t2", [batches])
+
+    df = ctx.sql(
+        """
+        WITH t1_expanded AS (
+            SELECT
+                a,
+                b,
+                CAST(a AS DOUBLE) / 1.5 AS c,
+                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS d
+            FROM t1
+            CROSS JOIN (SELECT 1 AS dummy FROM t1 LIMIT 5)
+        ),
+        t2_expanded AS (
+            SELECT
+                a,
+                b,
+                CAST(a AS DOUBLE) * 2.5 AS e,
+                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS f
+            FROM t2
+            CROSS JOIN (SELECT 1 AS dummy FROM t2 LIMIT 5)
+        )
+        SELECT
+            t1.a, t1.b, t1.c, t1.d,
+            t2.a AS a2, t2.b AS b2, t2.e, t2.f
+        FROM t1_expanded t1
+        JOIN t2_expanded t2 ON t1.a % 100 = t2.a % 100
+        WHERE t1.a > 100 AND t2.a > 100
+        """
+    )
+
+    reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
+
+    interrupted = False
+    interrupt_error = None
+    query_started = threading.Event()
+    max_wait_time = 5.0
+
+    def trigger_interrupt():
+        start_time = time.time()
+        while not query_started.is_set():
+            time.sleep(0.1)
+            if time.time() - start_time > max_wait_time:
+                msg = f"Query did not start within {max_wait_time} seconds"
+                raise RuntimeError(msg)
+
+        thread_id = threading.main_thread().ident
+        if thread_id is None:
+            msg = "Cannot get main thread ID"
+            raise RuntimeError(msg)
+
+        exception = ctypes.py_object(KeyboardInterrupt)
+        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
+            ctypes.c_long(thread_id), exception
+        )
+        if res != 1:
+            ctypes.pythonapi.PyThreadState_SetAsyncExc(
+                ctypes.c_long(thread_id), ctypes.py_object(0)
+            )
+            msg = "Failed to raise KeyboardInterrupt in main thread"
+            raise RuntimeError(msg)
+
+    interrupt_thread = threading.Thread(target=trigger_interrupt)
+    interrupt_thread.daemon = True
+    interrupt_thread.start()
+
+    try:
+        query_started.set()
+        # consume the reader which should block and be interrupted
+        reader.read_all()
+    except KeyboardInterrupt:
+        interrupted = True
+    except Exception as e:  # pragma: no cover - unexpected errors
+        interrupt_error = e
+
+    if not interrupted:
+        pytest.fail(f"Stream was not interrupted; got error: {interrupt_error}")
+
+    interrupt_thread.join(timeout=1.0)
+
+
 def test_show_select_where_no_rows(capsys) -> None:
     ctx = SessionContext()
     df = ctx.sql("SELECT 1 WHERE 1=0")
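The test's interrupt machinery relies on CPython's PyThreadState_SetAsyncExc, reached through ctypes: a helper thread schedules a KeyboardInterrupt in the main thread, and the interpreter raises it at that thread's next bytecode boundary. A minimal standalone sketch of the same trick (assumptions: CPython only; the helper name raise_keyboard_interrupt_in_main is illustrative and not part of the test suite):

import ctypes
import threading
import time


def raise_keyboard_interrupt_in_main():
    # Schedule a KeyboardInterrupt in the main thread; CPython delivers it
    # at that thread's next bytecode boundary.
    main_id = threading.main_thread().ident
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
        ctypes.c_long(main_id), ctypes.py_object(KeyboardInterrupt)
    )
    if res != 1:
        # 0 means the thread id was not found; >1 means several thread
        # states were modified, so undo the request by passing NULL.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(main_id), None)
        raise RuntimeError("failed to schedule KeyboardInterrupt")


threading.Timer(0.2, raise_keyboard_interrupt_in_main).start()
try:
    while True:  # stands in for the blocking reader.read_all() call
        time.sleep(0.01)
except KeyboardInterrupt:
    print("main thread interrupted as expected")

Because the pending exception is only delivered while the target thread is executing Python bytecode, a thread parked inside a native call never sees it; that is the gap the Rust change below closes by polling the stream through wait_for_future, which periodically checks for pending Python signals, instead of blocking in tokio.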

src/dataframe.rs

Lines changed: 13 additions & 8 deletions
@@ -42,7 +42,7 @@ use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyList, PyTuple, PyTupleMethods};
-use tokio::runtime::Handle;
+use tokio::task::JoinHandle;

 use crate::catalog::PyTable;
 use crate::errors::{py_datafusion_err, PyDataFusionError};

@@ -363,7 +363,6 @@ impl PyDataFrame {
 /// changes per batch.
 struct DataFrameStreamReader {
     stream: SendableRecordBatchStream,
-    runtime: Handle,
     schema: SchemaRef,
     projection: Option<SchemaRef>,
 }

@@ -372,8 +371,15 @@ impl Iterator for DataFrameStreamReader {
     type Item = Result<RecordBatch, ArrowError>;

     fn next(&mut self) -> Option<Self::Item> {
-        match self.runtime.block_on(self.stream.next()) {
-            Some(Ok(batch)) => {
+        // Use wait_for_future to poll the underlying async stream while
+        // respecting Python signal handling (e.g. ``KeyboardInterrupt``).
+        // This mirrors the behaviour of other synchronous wrappers and
+        // prevents blocking indefinitely when a Python interrupt is raised.
+        let fut = self.stream.next();
+        let result = Python::with_gil(|py| wait_for_future(py, fut));
+
+        match result {
+            Ok(Some(Ok(batch))) => {
                 let batch = if let Some(ref schema) = self.projection {
                     match record_batch_into_schema(batch, schema.as_ref()) {
                         Ok(b) => b,

@@ -384,8 +390,9 @@
                 };
                 Some(Ok(batch))
             }
-            Some(Err(e)) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
-            None => None,
+            Ok(Some(Err(e))) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
+            Ok(None) => None,
+            Err(e) => Some(Err(ArrowError::ExternalError(Box::new(e)))),
         }
     }
 }

@@ -921,7 +928,6 @@
         py: Python<'py>,
         requested_schema: Option<Bound<'py, PyCapsule>>,
     ) -> PyDataFusionResult<Bound<'py, PyCapsule>> {
-        let rt = &get_tokio_runtime().0;
         let df = self.df.as_ref().clone();
         let stream = spawn_stream(py, async move { df.execute_stream().await })?;

@@ -942,7 +948,6 @@

         let reader = DataFrameStreamReader {
             stream,
-            runtime: rt.handle().clone(),
             schema: schema_ref,
             projection,
         };
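From the user's perspective, this makes stream consumption interruptible. A minimal usage sketch (assumptions: the datafusion Python package is installed, and the one-row query is a stand-in for a long-running one; the consumption pattern mirrors the new test):

import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.sql("SELECT 1 AS a")

# Export the DataFrame through the Arrow C stream interface and consume it
# with pyarrow, as the new test does. A Ctrl+C during read_all() should now
# surface as KeyboardInterrupt instead of hanging inside tokio's block_on.
reader = pa.RecordBatchReader._import_from_c(df.__arrow_c_stream__())
try:
    table = reader.read_all()
    print(table.num_rows)
except KeyboardInterrupt:
    print("stream consumption interrupted")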
