Skip to content

Commit a2df6d5

Browse files
committed
feat: add configurable HTML formatter for DataFrames
1 parent 09b929a commit a2df6d5

3 files changed

Lines changed: 259 additions & 111 deletions

File tree

python/datafusion/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from .plan import ExecutionPlan, LogicalPlan
5050
from .record_batch import RecordBatch, RecordBatchStream
5151
from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf
52+
from .html_formatter import configure_formatter
5253

5354
__version__ = importlib_metadata.version(__name__)
5455

@@ -90,6 +91,7 @@
9091
"udf",
9192
"udwf",
9293
"unparser",
94+
"configure_formatter",
9395
]
9496

9597

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
"""HTML formatting utilities for DataFusion DataFrames."""
2+
3+
from html import escape
from typing import Any, Dict, Optional, Union
4+
5+
6+
class DataFrameHtmlFormatter:
    """Configurable HTML formatter for DataFusion DataFrames.

    Renders record batches as a scrollable HTML table, optionally with
    expand/collapse buttons on long cell values. Column names and cell
    values are HTML-escaped before being placed in the markup, so data
    containing ``<``, ``>``, ``&`` or quotes is shown literally instead of
    being interpreted by the browser.

    Args:
        max_cell_length: Maximum characters to display in a cell before truncation
        max_width: Maximum width of the HTML table in pixels
        max_height: Maximum height of the HTML table in pixels
        enable_cell_expansion: Whether to add expand/collapse buttons for long cell values
        custom_css: Additional CSS to include in the HTML output
        show_truncation_message: Whether to display a message when data is truncated
    """

    def __init__(
        self,
        max_cell_length: int = 25,
        max_width: int = 1000,
        max_height: int = 300,
        enable_cell_expansion: bool = True,
        custom_css: Optional[str] = None,
        show_truncation_message: bool = True,
    ):
        self.max_cell_length = max_cell_length
        self.max_width = max_width
        self.max_height = max_height
        self.enable_cell_expansion = enable_cell_expansion
        self.custom_css = custom_css
        self.show_truncation_message = show_truncation_message

    def format_html(
        self,
        batches: list,
        schema: Any,
        has_more: bool = False,
        table_uuid: Optional[str] = None,
    ) -> str:
        """Format record batches as HTML.

        Args:
            batches: List of Arrow RecordBatch objects (anything exposing
                ``num_rows`` and ``columns`` works)
            schema: Arrow Schema object (anything exposing ``fields`` whose
                items have a ``name``)
            has_more: Whether there are more batches not shown
            table_uuid: Unique ID for the table, used for JavaScript
                interactions. It is inserted verbatim into element ids and
                an ``onclick`` handler, so it must not contain quotes.

        Returns:
            HTML string representation of the data, or the plain text
            ``"No data to display"`` when ``batches`` is empty.
        """
        if not batches:
            return "No data to display"

        # Generate an ID if none provided
        table_uuid = table_uuid or "df-" + str(id(batches))

        # CSS styles (defaults first so custom_css can override them)
        parts = ["<style>", self._get_default_css()]
        if self.custom_css:
            parts.append(self.custom_css)
        parts.append("</style>")

        # Scrollable table container
        parts.append(
            f'<div style="width: 100%; max-width: {self.max_width}px; '
            f'max-height: {self.max_height}px; overflow: auto; border: 1px solid #ccc;">'
        )
        parts.append('<table style="border-collapse: collapse; min-width: 100%">')

        # Table header; names are escaped because they come from user data
        parts.append("<thead>")
        parts.append("<tr>")
        for field in schema.fields:
            parts.append(
                "<th style='border: 1px solid black; padding: 8px; "
                "text-align: left; background-color: #f2f2f2; "
                "white-space: nowrap; min-width: fit-content; "
                f"max-width: fit-content;'>{escape(str(field.name))}</th>"
            )
        parts.append("</tr>")
        parts.append("</thead>")

        # Table body. row_count is 1-based and runs across all batches so
        # that the element ids built from it stay unique within the table.
        parts.append("<tbody>")
        row_count = 0
        for batch in batches:
            for row_idx in range(batch.num_rows):
                row_count += 1
                parts.append("<tr>")
                for col_idx, column in enumerate(batch.columns):
                    text = str(self._format_cell_value(column, row_idx))
                    parts.append(
                        self._render_cell(text, table_uuid, row_count, col_idx)
                    )
                parts.append("</tr>")
        parts.append("</tbody>")
        parts.append("</table>")
        parts.append("</div>")

        # JavaScript backing the expand/collapse buttons
        if self.enable_cell_expansion:
            parts.append(self._get_javascript())

        # Truncation message if needed
        if has_more and self.show_truncation_message:
            parts.append("<div>Data truncated due to size.</div>")

        return "\n".join(parts)

    def _render_cell(self, text: str, table_uuid: str, row: int, col: int) -> str:
        """Return the ``<td>`` markup for one cell, escaping ``text``."""
        full_value = escape(text)
        if len(text) > self.max_cell_length and self.enable_cell_expansion:
            # Truncate the raw text first, then escape, so an entity such as
            # "&lt;" is never cut in half.
            short_value = escape(text[: self.max_cell_length])
            return (
                f"<td style='border: 1px solid black; padding: 8px; "
                f"text-align: left; white-space: nowrap;'>"
                f"<div class='expandable-container'>"
                f"<span class='expandable' id='{table_uuid}-min-text-{row}-{col}'>"
                f"{short_value}</span>"
                f"<span class='full-text' id='{table_uuid}-full-text-{row}-{col}'>"
                f"{full_value}</span>"
                f"<button class='expand-btn' "
                f"onclick=\"toggleDataFrameCellText('{table_uuid}',{row},{col})\">"
                f"...</button>"
                f"</div>"
                f"</td>"
            )
        return (
            f"<td style='border: 1px solid black; padding: 8px; "
            f"text-align: left; white-space: nowrap;'>{full_value}</td>"
        )

    def _format_cell_value(self, column: Any, row_idx: int) -> str:
        """Format a cell value for display.

        Args:
            column: Arrow array
            row_idx: Row index

        Returns:
            The unescaped string form of the value, or an empty string when
            the column cannot be indexed at ``row_idx``.
        """
        # This is a simplified implementation for Python-side formatting
        # In practice, we'd want to handle different Arrow types appropriately
        try:
            return str(column[row_idx])
        except (IndexError, TypeError):
            return ""

    def _get_default_css(self) -> str:
        """Get default CSS styles for the HTML table."""
        return """
        .expandable-container {
            display: inline-block;
            max-width: 200px;
        }
        .expandable {
            white-space: nowrap;
            overflow: hidden;
            text-overflow: ellipsis;
            display: block;
        }
        .full-text {
            display: none;
            white-space: normal;
        }
        .expand-btn {
            cursor: pointer;
            color: blue;
            text-decoration: underline;
            border: none;
            background: none;
            font-size: inherit;
            display: block;
            margin-top: 5px;
        }
        """

    def _get_javascript(self) -> str:
        """Get JavaScript code for interactive elements."""
        return """
        <script>
        function toggleDataFrameCellText(table_uuid, row, col) {
            var shortText = document.getElementById(table_uuid + "-min-text-" + row + "-" + col);
            var fullText = document.getElementById(table_uuid + "-full-text-" + row + "-" + col);
            var button = event.target;

            // The collapsed state comes from the .full-text stylesheet rule, so
            // the inline style is empty until the first toggle. Read the
            // computed value, otherwise the first click would do nothing.
            if (window.getComputedStyle(fullText).display === "none") {
                shortText.style.display = "none";
                fullText.style.display = "inline";
                button.textContent = "(less)";
            } else {
                // Clear the inline override so the .expandable rule
                // (display: block, needed for the ellipsis) applies again.
                shortText.style.display = "";
                fullText.style.display = "none";
                button.textContent = "...";
            }
        }
        </script>
        """
214+
215+
216+
# Global formatter instance to be used by default
217+
# Rebound (not mutated in place) by configure_formatter(); read it through
# get_formatter() rather than holding on to this reference.
_default_formatter = DataFrameHtmlFormatter()
218+
219+
220+
def get_formatter() -> DataFrameHtmlFormatter:
    """Return the formatter instance currently installed at module level."""
    current = _default_formatter
    return current
223+
224+
225+
def configure_formatter(**kwargs: Any) -> None:
    """Install a new global DataFrame HTML formatter.

    A fresh ``DataFrameHtmlFormatter`` is built from the given keyword
    arguments and replaces the previous global instance, so any option
    not passed here takes the constructor's default.

    Args:
        **kwargs: Formatter configuration parameters, forwarded unchanged
            to ``DataFrameHtmlFormatter``
    """
    global _default_formatter
    replacement = DataFrameHtmlFormatter(**kwargs)
    _default_formatter = replacement

0 commit comments

Comments
 (0)