@@ -292,15 +292,20 @@ impl PyParquetColumnOptions {
292292pub struct PyDataFrame {
293293 df : Arc < DataFrame > ,
294294
295+ // Hold the session state so streams/readers can keep the
296+ // underlying SessionContext alive while Python iterates.
297+ session_state : Arc < SessionState > ,
298+
295299 // In IPython environment cache batches between __repr__ and _repr_html_ calls.
296300 batches : Option < ( Vec < RecordBatch > , bool ) > ,
297301}
298302
299303impl PyDataFrame {
300304 /// creates a new PyDataFrame
301- pub fn new ( df : DataFrame ) -> Self {
305+ pub fn new ( df : DataFrame , session_state : Arc < SessionState > ) -> Self {
302306 Self {
303307 df : Arc :: new ( df) ,
308+ session_state,
304309 batches : None ,
305310 }
306311 }
@@ -481,7 +486,7 @@ impl PyDataFrame {
481486 fn describe ( & self , py : Python ) -> PyDataFusionResult < Self > {
482487 let df = self . df . as_ref ( ) . clone ( ) ;
483488 let stat_df = spawn_future ( py, async move { df. describe ( ) . await } ) ?;
484- Ok ( Self :: new ( stat_df) )
489+ Ok ( Self :: new ( stat_df, self . session_state . clone ( ) ) )
485490 }
486491
487492 /// Returns the schema from the logical plan
@@ -511,31 +516,31 @@ impl PyDataFrame {
511516 fn select_columns ( & self , args : Vec < PyBackedStr > ) -> PyDataFusionResult < Self > {
512517 let args = args. iter ( ) . map ( |s| s. as_ref ( ) ) . collect :: < Vec < & str > > ( ) ;
513518 let df = self . df . as_ref ( ) . clone ( ) . select_columns ( & args) ?;
514- Ok ( Self :: new ( df) )
519+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
515520 }
516521
517522 #[ pyo3( signature = ( * args) ) ]
518523 fn select ( & self , args : Vec < PyExpr > ) -> PyDataFusionResult < Self > {
519524 let expr: Vec < Expr > = args. into_iter ( ) . map ( |e| e. into ( ) ) . collect ( ) ;
520525 let df = self . df . as_ref ( ) . clone ( ) . select ( expr) ?;
521- Ok ( Self :: new ( df) )
526+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
522527 }
523528
524529 #[ pyo3( signature = ( * args) ) ]
525530 fn drop ( & self , args : Vec < PyBackedStr > ) -> PyDataFusionResult < Self > {
526531 let cols = args. iter ( ) . map ( |s| s. as_ref ( ) ) . collect :: < Vec < & str > > ( ) ;
527532 let df = self . df . as_ref ( ) . clone ( ) . drop_columns ( & cols) ?;
528- Ok ( Self :: new ( df) )
533+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
529534 }
530535
531536 fn filter ( & self , predicate : PyExpr ) -> PyDataFusionResult < Self > {
532537 let df = self . df . as_ref ( ) . clone ( ) . filter ( predicate. into ( ) ) ?;
533- Ok ( Self :: new ( df) )
538+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
534539 }
535540
536541 fn with_column ( & self , name : & str , expr : PyExpr ) -> PyDataFusionResult < Self > {
537542 let df = self . df . as_ref ( ) . clone ( ) . with_column ( name, expr. into ( ) ) ?;
538- Ok ( Self :: new ( df) )
543+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
539544 }
540545
541546 fn with_columns ( & self , exprs : Vec < PyExpr > ) -> PyDataFusionResult < Self > {
@@ -545,7 +550,7 @@ impl PyDataFrame {
545550 let name = format ! ( "{}" , expr. schema_name( ) ) ;
546551 df = df. with_column ( name. as_str ( ) , expr) ?
547552 }
548- Ok ( Self :: new ( df) )
553+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
549554 }
550555
551556 /// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -556,27 +561,27 @@ impl PyDataFrame {
556561 . as_ref ( )
557562 . clone ( )
558563 . with_column_renamed ( old_name, new_name) ?;
559- Ok ( Self :: new ( df) )
564+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
560565 }
561566
562567 fn aggregate ( & self , group_by : Vec < PyExpr > , aggs : Vec < PyExpr > ) -> PyDataFusionResult < Self > {
563568 let group_by = group_by. into_iter ( ) . map ( |e| e. into ( ) ) . collect ( ) ;
564569 let aggs = aggs. into_iter ( ) . map ( |e| e. into ( ) ) . collect ( ) ;
565570 let df = self . df . as_ref ( ) . clone ( ) . aggregate ( group_by, aggs) ?;
566- Ok ( Self :: new ( df) )
571+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
567572 }
568573
569574 #[ pyo3( signature = ( * exprs) ) ]
570575 fn sort ( & self , exprs : Vec < PySortExpr > ) -> PyDataFusionResult < Self > {
571576 let exprs = to_sort_expressions ( exprs) ;
572577 let df = self . df . as_ref ( ) . clone ( ) . sort ( exprs) ?;
573- Ok ( Self :: new ( df) )
578+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
574579 }
575580
576581 #[ pyo3( signature = ( count, offset=0 ) ) ]
577582 fn limit ( & self , count : usize , offset : usize ) -> PyDataFusionResult < Self > {
578583 let df = self . df . as_ref ( ) . clone ( ) . limit ( offset, Some ( count) ) ?;
579- Ok ( Self :: new ( df) )
584+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
580585 }
581586
582587 /// Executes the plan, returning a list of `RecordBatch`es.
@@ -593,7 +598,7 @@ impl PyDataFrame {
593598 /// Cache DataFrame.
594599 fn cache ( & self , py : Python ) -> PyDataFusionResult < Self > {
595600 let df = wait_for_future ( py, self . df . as_ref ( ) . clone ( ) . cache ( ) ) ??;
596- Ok ( Self :: new ( df) )
601+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
597602 }
598603
599604 /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
@@ -618,7 +623,7 @@ impl PyDataFrame {
618623 /// Filter out duplicate rows
619624 fn distinct ( & self ) -> PyDataFusionResult < Self > {
620625 let df = self . df . as_ref ( ) . clone ( ) . distinct ( ) ?;
621- Ok ( Self :: new ( df) )
626+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
622627 }
623628
624629 fn join (
@@ -652,7 +657,7 @@ impl PyDataFrame {
652657 & right_keys,
653658 None ,
654659 ) ?;
655- Ok ( Self :: new ( df) )
660+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
656661 }
657662
658663 fn join_on (
@@ -681,7 +686,7 @@ impl PyDataFrame {
681686 . as_ref ( )
682687 . clone ( )
683688 . join_on ( right. df . as_ref ( ) . clone ( ) , join_type, exprs) ?;
684- Ok ( Self :: new ( df) )
689+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
685690 }
686691
687692 /// Print the query plan
@@ -714,7 +719,7 @@ impl PyDataFrame {
714719 . as_ref ( )
715720 . clone ( )
716721 . repartition ( Partitioning :: RoundRobinBatch ( num) ) ?;
717- Ok ( Self :: new ( new_df) )
722+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
718723 }
719724
720725 /// Repartition a `DataFrame` based on a logical partitioning scheme.
@@ -726,7 +731,7 @@ impl PyDataFrame {
726731 . as_ref ( )
727732 . clone ( )
728733 . repartition ( Partitioning :: Hash ( expr, num) ) ?;
729- Ok ( Self :: new ( new_df) )
734+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
730735 }
731736
732737 /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The
@@ -742,7 +747,7 @@ impl PyDataFrame {
742747 self . df . as_ref ( ) . clone ( ) . union ( py_df. df . as_ref ( ) . clone ( ) ) ?
743748 } ;
744749
745- Ok ( Self :: new ( new_df) )
750+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
746751 }
747752
748753 /// Calculate the distinct union of two `DataFrame`s. The
@@ -753,7 +758,7 @@ impl PyDataFrame {
753758 . as_ref ( )
754759 . clone ( )
755760 . union_distinct ( py_df. df . as_ref ( ) . clone ( ) ) ?;
756- Ok ( Self :: new ( new_df) )
761+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
757762 }
758763
759764 #[ pyo3( signature = ( column, preserve_nulls=true ) ) ]
@@ -766,7 +771,7 @@ impl PyDataFrame {
766771 . as_ref ( )
767772 . clone ( )
768773 . unnest_columns_with_options ( & [ column] , unnest_options) ?;
769- Ok ( Self :: new ( df) )
774+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
770775 }
771776
772777 #[ pyo3( signature = ( columns, preserve_nulls=true ) ) ]
@@ -784,7 +789,7 @@ impl PyDataFrame {
784789 . as_ref ( )
785790 . clone ( )
786791 . unnest_columns_with_options ( & cols, unnest_options) ?;
787- Ok ( Self :: new ( df) )
792+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
788793 }
789794
790795 /// Calculate the intersection of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
@@ -794,13 +799,13 @@ impl PyDataFrame {
794799 . as_ref ( )
795800 . clone ( )
796801 . intersect ( py_df. df . as_ref ( ) . clone ( ) ) ?;
797- Ok ( Self :: new ( new_df) )
802+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
798803 }
799804
800805 /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
801806 fn except_all ( & self , py_df : PyDataFrame ) -> PyDataFusionResult < Self > {
802807 let new_df = self . df . as_ref ( ) . clone ( ) . except ( py_df. df . as_ref ( ) . clone ( ) ) ?;
803- Ok ( Self :: new ( new_df) )
808+ Ok ( Self :: new ( new_df, self . session_state . clone ( ) ) )
804809 }
805810
806811 /// Write a `DataFrame` to a CSV file.
@@ -957,7 +962,7 @@ impl PyDataFrame {
957962 requested_schema : Option < Bound < ' py , PyCapsule > > ,
958963 ) -> PyDataFusionResult < Bound < ' py , PyCapsule > > {
959964 let df = self . df . as_ref ( ) . clone ( ) ;
960- let state = df . session_state ( ) . clone ( ) ;
965+ let state = self . session_state . clone ( ) ;
961966 let streams = spawn_future ( py, async move { df. execute_stream_partitioned ( ) . await } ) ?;
962967 let streams = streams
963968 . into_iter ( )
@@ -997,14 +1002,14 @@ impl PyDataFrame {
9971002
9981003 fn execute_stream ( & self , py : Python ) -> PyDataFusionResult < PyRecordBatchStream > {
9991004 let df = self . df . as_ref ( ) . clone ( ) ;
1000- let state = df . session_state ( ) . clone ( ) ;
1005+ let state = self . session_state . clone ( ) ;
10011006 let stream = spawn_future ( py, async move { df. execute_stream ( ) . await } ) ?;
10021007 Ok ( PyRecordBatchStream :: new ( stream, state) )
10031008 }
10041009
10051010 fn execute_stream_partitioned ( & self , py : Python ) -> PyResult < Vec < PyRecordBatchStream > > {
10061011 let df = self . df . as_ref ( ) . clone ( ) ;
1007- let state = df . session_state ( ) . clone ( ) ;
1012+ let state = self . session_state . clone ( ) ;
10081013 let streams = spawn_future ( py, async move { df. execute_stream_partitioned ( ) . await } ) ?;
10091014 Ok ( streams
10101015 . into_iter ( )
@@ -1073,7 +1078,7 @@ impl PyDataFrame {
10731078 } ;
10741079
10751080 let df = self . df . as_ref ( ) . clone ( ) . fill_null ( scalar_value, cols) ?;
1076- Ok ( Self :: new ( df) )
1081+ Ok ( Self :: new ( df, self . session_state . clone ( ) ) )
10771082 }
10781083}
10791084
0 commit comments