Skip to content

Commit 4d8fa38

Browse files
committed
Revert "feat: add configurable max table bytes and min table rows for DataFrame display"
This reverts commit f9b78fa.
1 parent f9b78fa commit 4d8fa38

2 files changed

Lines changed: 31 additions & 57 deletions

File tree

python/datafusion/html_formatter.py

Lines changed: 1 addition & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -98,8 +98,6 @@ class DataFrameHtmlFormatter:
9898
style_provider: Custom provider for cell and header styles
9999
use_shared_styles: Whether to load styles and scripts only once per notebook
100100
session
101-
max_table_bytes: Maximum bytes to display for table presentation (default: 2MB)
102-
min_table_rows: Minimum number of table rows to display (default: 20)
103101
"""
104102

105103
# Class variable to track if styles have been loaded in the notebook
@@ -115,8 +113,6 @@ def __init__(
115113
show_truncation_message: bool = True,
116114
style_provider: Optional[StyleProvider] = None,
117115
use_shared_styles: bool = True,
118-
max_table_bytes: int = 2 * 1024 * 1024, # 2 MB
119-
min_table_rows: int = 20,
120116
) -> None:
121117
"""Initialize the HTML formatter.
122118
@@ -139,16 +135,11 @@ def __init__(
139135
is used.
140136
use_shared_styles : bool, default True
141137
Whether to use shared styles across multiple tables.
142-
max_table_bytes : int, default 2MB (2 * 1024 * 1024)
143-
Maximum bytes to display for table presentation.
144-
min_table_rows : int, default 20
145-
Minimum number of table rows to display.
146138
147139
Raises:
148140
------
149141
ValueError
150-
If max_cell_length, max_width, max_height, max_table_bytes, or min_table_rows
151-
is not a positive integer.
142+
If max_cell_length, max_width, or max_height is not a positive integer.
152143
TypeError
153144
If enable_cell_expansion, show_truncation_message, or use_shared_styles is
154145
not a boolean,
@@ -167,12 +158,6 @@ def __init__(
167158
if not isinstance(max_height, int) or max_height <= 0:
168159
msg = "max_height must be a positive integer"
169160
raise ValueError(msg)
170-
if not isinstance(max_table_bytes, int) or max_table_bytes <= 0:
171-
msg = "max_table_bytes must be a positive integer"
172-
raise ValueError(msg)
173-
if not isinstance(min_table_rows, int) or min_table_rows <= 0:
174-
msg = "min_table_rows must be a positive integer"
175-
raise ValueError(msg)
176161

177162
# Validate boolean parameters
178163
if not isinstance(enable_cell_expansion, bool):
@@ -203,8 +188,6 @@ def __init__(
203188
self.show_truncation_message = show_truncation_message
204189
self.style_provider = style_provider or DefaultStyleProvider()
205190
self.use_shared_styles = use_shared_styles
206-
self.max_table_bytes = max_table_bytes
207-
self.min_table_rows = min_table_rows
208191
# Registry for custom type formatters
209192
self._type_formatters: dict[type, CellFormatter] = {}
210193
# Custom cell builders

src/dataframe.rs

Lines changed: 30 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,8 @@ impl PyTableProvider {
7171
PyTable::new(table_provider)
7272
}
7373
}
74+
const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
75+
const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
7476

7577
/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
7678
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -79,16 +81,12 @@ impl PyTableProvider {
7981
#[derive(Clone)]
8082
pub struct PyDataFrame {
8183
df: Arc<DataFrame>,
82-
display_config: Arc<PyDataframeDisplayConfig>,
8384
}
8485

8586
impl PyDataFrame {
8687
/// creates a new PyDataFrame
87-
pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self {
88-
Self {
89-
df: Arc::new(df),
90-
display_config: Arc::new(display_config),
91-
}
88+
pub fn new(df: DataFrame) -> Self {
89+
Self { df: Arc::new(df) }
9290
}
9391
}
9492

@@ -118,12 +116,7 @@ impl PyDataFrame {
118116
fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
119117
let (batches, has_more) = wait_for_future(
120118
py,
121-
collect_record_batches_to_display(
122-
self.df.as_ref().clone(),
123-
10,
124-
10,
125-
self.display_config.max_table_bytes,
126-
),
119+
collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
127120
)?;
128121
if batches.is_empty() {
129122
// This should not be reached, but do it for safety since we index into the vector below
@@ -146,9 +139,8 @@ impl PyDataFrame {
146139
py,
147140
collect_record_batches_to_display(
148141
self.df.as_ref().clone(),
149-
self.display_config.min_table_rows,
142+
MIN_TABLE_ROWS_TO_DISPLAY,
150143
usize::MAX,
151-
self.display_config.max_table_bytes,
152144
),
153145
)?;
154146
if batches.is_empty() {
@@ -189,7 +181,7 @@ impl PyDataFrame {
189181
fn describe(&self, py: Python) -> PyDataFusionResult<Self> {
190182
let df = self.df.as_ref().clone();
191183
let stat_df = wait_for_future(py, df.describe())?;
192-
Ok(Self::new(stat_df, (*self.display_config).clone()))
184+
Ok(Self::new(stat_df))
193185
}
194186

195187
/// Returns the schema from the logical plan
@@ -219,31 +211,31 @@ impl PyDataFrame {
219211
fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
220212
let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
221213
let df = self.df.as_ref().clone().select_columns(&args)?;
222-
Ok(Self::new(df, (*self.display_config).clone()))
214+
Ok(Self::new(df))
223215
}
224216

225217
#[pyo3(signature = (*args))]
226218
fn select(&self, args: Vec<PyExpr>) -> PyDataFusionResult<Self> {
227219
let expr = args.into_iter().map(|e| e.into()).collect();
228220
let df = self.df.as_ref().clone().select(expr)?;
229-
Ok(Self::new(df, (*self.display_config).clone()))
221+
Ok(Self::new(df))
230222
}
231223

232224
#[pyo3(signature = (*args))]
233225
fn drop(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
234226
let cols = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
235227
let df = self.df.as_ref().clone().drop_columns(&cols)?;
236-
Ok(Self::new(df, (*self.display_config).clone()))
228+
Ok(Self::new(df))
237229
}
238230

239231
fn filter(&self, predicate: PyExpr) -> PyDataFusionResult<Self> {
240232
let df = self.df.as_ref().clone().filter(predicate.into())?;
241-
Ok(Self::new(df, (*self.display_config).clone()))
233+
Ok(Self::new(df))
242234
}
243235

244236
fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult<Self> {
245237
let df = self.df.as_ref().clone().with_column(name, expr.into())?;
246-
Ok(Self::new(df, (*self.display_config).clone()))
238+
Ok(Self::new(df))
247239
}
248240

249241
fn with_columns(&self, exprs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
@@ -253,7 +245,7 @@ impl PyDataFrame {
253245
let name = format!("{}", expr.schema_name());
254246
df = df.with_column(name.as_str(), expr)?
255247
}
256-
Ok(Self::new(df, (*self.display_config).clone()))
248+
Ok(Self::new(df))
257249
}
258250

259251
/// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -264,27 +256,27 @@ impl PyDataFrame {
264256
.as_ref()
265257
.clone()
266258
.with_column_renamed(old_name, new_name)?;
267-
Ok(Self::new(df, (*self.display_config).clone()))
259+
Ok(Self::new(df))
268260
}
269261

270262
fn aggregate(&self, group_by: Vec<PyExpr>, aggs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
271263
let group_by = group_by.into_iter().map(|e| e.into()).collect();
272264
let aggs = aggs.into_iter().map(|e| e.into()).collect();
273265
let df = self.df.as_ref().clone().aggregate(group_by, aggs)?;
274-
Ok(Self::new(df, (*self.display_config).clone()))
266+
Ok(Self::new(df))
275267
}
276268

277269
#[pyo3(signature = (*exprs))]
278270
fn sort(&self, exprs: Vec<PySortExpr>) -> PyDataFusionResult<Self> {
279271
let exprs = to_sort_expressions(exprs);
280272
let df = self.df.as_ref().clone().sort(exprs)?;
281-
Ok(Self::new(df, (*self.display_config).clone()))
273+
Ok(Self::new(df))
282274
}
283275

284276
#[pyo3(signature = (count, offset=0))]
285277
fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult<Self> {
286278
let df = self.df.as_ref().clone().limit(offset, Some(count))?;
287-
Ok(Self::new(df, (*self.display_config).clone()))
279+
Ok(Self::new(df))
288280
}
289281

290282
/// Executes the plan, returning a list of `RecordBatch`es.
@@ -301,7 +293,7 @@ impl PyDataFrame {
301293
/// Cache DataFrame.
302294
fn cache(&self, py: Python) -> PyDataFusionResult<Self> {
303295
let df = wait_for_future(py, self.df.as_ref().clone().cache())?;
304-
Ok(Self::new(df, (*self.display_config).clone()))
296+
Ok(Self::new(df))
305297
}
306298

307299
/// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
@@ -326,7 +318,7 @@ impl PyDataFrame {
326318
/// Filter out duplicate rows
327319
fn distinct(&self) -> PyDataFusionResult<Self> {
328320
let df = self.df.as_ref().clone().distinct()?;
329-
Ok(Self::new(df, (*self.display_config).clone()))
321+
Ok(Self::new(df))
330322
}
331323

332324
fn join(
@@ -360,7 +352,7 @@ impl PyDataFrame {
360352
&right_keys,
361353
None,
362354
)?;
363-
Ok(Self::new(df, (*self.display_config).clone()))
355+
Ok(Self::new(df))
364356
}
365357

366358
fn join_on(
@@ -389,7 +381,7 @@ impl PyDataFrame {
389381
.as_ref()
390382
.clone()
391383
.join_on(right.df.as_ref().clone(), join_type, exprs)?;
392-
Ok(Self::new(df, (*self.display_config).clone()))
384+
Ok(Self::new(df))
393385
}
394386

395387
/// Print the query plan
@@ -422,7 +414,7 @@ impl PyDataFrame {
422414
.as_ref()
423415
.clone()
424416
.repartition(Partitioning::RoundRobinBatch(num))?;
425-
Ok(Self::new(new_df, (*self.display_config).clone()))
417+
Ok(Self::new(new_df))
426418
}
427419

428420
/// Repartition a `DataFrame` based on a logical partitioning scheme.
@@ -434,7 +426,7 @@ impl PyDataFrame {
434426
.as_ref()
435427
.clone()
436428
.repartition(Partitioning::Hash(expr, num))?;
437-
Ok(Self::new(new_df, (*self.display_config).clone()))
429+
Ok(Self::new(new_df))
438430
}
439431

440432
/// Calculate the union of two `DataFrame`s, preserving duplicate rows.The
@@ -450,7 +442,7 @@ impl PyDataFrame {
450442
self.df.as_ref().clone().union(py_df.df.as_ref().clone())?
451443
};
452444

453-
Ok(Self::new(new_df, (*self.display_config).clone()))
445+
Ok(Self::new(new_df))
454446
}
455447

456448
/// Calculate the distinct union of two `DataFrame`s. The
@@ -461,7 +453,7 @@ impl PyDataFrame {
461453
.as_ref()
462454
.clone()
463455
.union_distinct(py_df.df.as_ref().clone())?;
464-
Ok(Self::new(new_df, (*self.display_config).clone()))
456+
Ok(Self::new(new_df))
465457
}
466458

467459
#[pyo3(signature = (column, preserve_nulls=true))]
@@ -502,13 +494,13 @@ impl PyDataFrame {
502494
.as_ref()
503495
.clone()
504496
.intersect(py_df.df.as_ref().clone())?;
505-
Ok(Self::new(new_df, (*self.display_config).clone()))
497+
Ok(Self::new(new_df))
506498
}
507499

508500
/// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
509501
fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
510502
let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?;
511-
Ok(Self::new(new_df, (*self.display_config).clone()))
503+
Ok(Self::new(new_df))
512504
}
513505

514506
/// Write a `DataFrame` to a CSV file.
@@ -806,7 +798,6 @@ async fn collect_record_batches_to_display(
806798
df: DataFrame,
807799
min_rows: usize,
808800
max_rows: usize,
809-
max_table_bytes: usize,
810801
) -> Result<(Vec<RecordBatch>, bool), DataFusionError> {
811802
let partitioned_stream = df.execute_stream_partitioned().await?;
812803
let mut stream = futures::stream::iter(partitioned_stream).flatten();
@@ -815,7 +806,7 @@ async fn collect_record_batches_to_display(
815806
let mut record_batches = Vec::default();
816807
let mut has_more = false;
817808

818-
while (size_estimate_so_far < max_table_bytes && rows_so_far < max_rows)
809+
while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows)
819810
|| rows_so_far < min_rows
820811
{
821812
let mut rb = match stream.next().await {
@@ -830,8 +821,8 @@ async fn collect_record_batches_to_display(
830821
if rows_in_rb > 0 {
831822
size_estimate_so_far += rb.get_array_memory_size();
832823

833-
if size_estimate_so_far > max_table_bytes {
834-
let ratio = max_table_bytes as f32 / size_estimate_so_far as f32;
824+
if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY {
825+
let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32;
835826
let total_rows = rows_in_rb + rows_so_far;
836827

837828
let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize;

0 commit comments

Comments (0)