|
| 1 | +"""HTML formatting utilities for DataFusion DataFrames.""" |
| 2 | + |
| 3 | +from typing import Dict, Optional, Any, Union |
| 4 | + |
| 5 | + |
class DataFrameHtmlFormatter:
    """Configurable HTML formatter for DataFusion DataFrames.

    This class handles the HTML rendering of DataFrames for display in
    Jupyter notebooks and other rich display contexts.

    Text that originates from the data (column names and cell values) is
    HTML-escaped before it is embedded in the markup. ``custom_css`` and
    ``table_uuid`` are inserted verbatim, so callers are responsible for
    passing well-formed values for those two.

    Args:
        max_cell_length: Maximum characters to display in a cell before truncation
        max_width: Maximum width of the HTML table in pixels
        max_height: Maximum height of the HTML table in pixels
        enable_cell_expansion: Whether to add expand/collapse buttons for long cell values
        custom_css: Additional CSS to include in the HTML output
        show_truncation_message: Whether to display a message when data is truncated
    """

    # Inline style applied to every <th> element.
    _HEADER_STYLE = (
        "border: 1px solid black; padding: 8px; "
        "text-align: left; background-color: #f2f2f2; "
        "white-space: nowrap; min-width: fit-content; "
        "max-width: fit-content;"
    )
    # Inline style applied to every <td> element.
    _CELL_STYLE = (
        "border: 1px solid black; padding: 8px; "
        "text-align: left; white-space: nowrap;"
    )

    def __init__(
        self,
        max_cell_length: int = 25,
        max_width: int = 1000,
        max_height: int = 300,
        enable_cell_expansion: bool = True,
        custom_css: Optional[str] = None,
        show_truncation_message: bool = True,
    ):
        self.max_cell_length = max_cell_length
        self.max_width = max_width
        self.max_height = max_height
        self.enable_cell_expansion = enable_cell_expansion
        self.custom_css = custom_css
        self.show_truncation_message = show_truncation_message

    def format_html(
        self,
        batches: list,
        schema: Any,
        has_more: bool = False,
        table_uuid: Optional[str] = None,
    ) -> str:
        """Format record batches as HTML.

        Args:
            batches: List of Arrow RecordBatch objects
            schema: Arrow Schema object
            has_more: Whether there are more batches not shown
            table_uuid: Unique ID for the table, used for JavaScript interactions.
                It is interpolated verbatim into element ids and the button's
                ``onclick`` handler.

        Returns:
            HTML string representation of the data, or the plain text
            ``"No data to display"`` when ``batches`` is empty.
        """
        if not batches:
            return "No data to display"

        # Generate an ID if none provided
        table_uuid = table_uuid or "df-" + str(id(batches))

        # Collected fragments are joined with newlines at the end.
        parts = ["<style>", self._get_default_css()]
        if self.custom_css:
            parts.append(self.custom_css)
        parts.append("</style>")

        # Scrollable container that enforces the configured maximum size
        parts.append(
            f'<div style="width: 100%; max-width: {self.max_width}px; '
            f'max-height: {self.max_height}px; overflow: auto; border: 1px solid #ccc;">'
        )
        parts.append('<table style="border-collapse: collapse; min-width: 100%">')

        parts.extend(self._build_header(schema))
        parts.extend(self._build_body(batches, table_uuid))

        parts.append("</table>")
        parts.append("</div>")

        # The toggle script is only needed when expandable cells may exist
        if self.enable_cell_expansion:
            parts.append(self._get_javascript())

        if has_more and self.show_truncation_message:
            parts.append("<div>Data truncated due to size.</div>")

        return "\n".join(parts)

    @staticmethod
    def _escape_text(text: Any) -> str:
        """Return ``text`` converted to ``str`` and HTML-escaped."""
        # Function-scope import keeps this class self-contained.
        from html import escape

        return escape(str(text))

    def _build_header(self, schema: Any) -> list:
        """Return the ``<thead>`` fragments, one ``<th>`` per schema field."""
        fragments = ["<thead>", "<tr>"]
        for field in schema.fields:
            name = self._escape_text(field.name)
            fragments.append(f"<th style='{self._HEADER_STYLE}'>{name}</th>")
        fragments.append("</tr>")
        fragments.append("</thead>")
        return fragments

    def _build_body(self, batches: list, table_uuid: str) -> list:
        """Return the ``<tbody>`` fragments for every row of every batch."""
        fragments = ["<tbody>"]
        # Row numbers are 1-based and keep counting across batches so that
        # the element ids generated for expandable cells do not repeat.
        row_number = 0
        for batch in batches:
            for row_idx in range(batch.num_rows):
                row_number += 1
                fragments.append("<tr>")
                for col_idx, column in enumerate(batch.columns):
                    raw_text = self._format_cell_value(column, row_idx)
                    fragments.append(
                        self._build_cell(raw_text, table_uuid, row_number, col_idx)
                    )
                fragments.append("</tr>")
        fragments.append("</tbody>")
        return fragments

    def _build_cell(
        self, raw_text: str, table_uuid: str, row_number: int, col_idx: int
    ) -> str:
        """Return the ``<td>`` markup for one cell.

        Values longer than ``max_cell_length`` get a truncated span, a hidden
        full-text span and a toggle button when cell expansion is enabled;
        every other value is rendered as a plain cell.
        """
        raw_text = str(raw_text)
        full_text = self._escape_text(raw_text)

        if len(raw_text) > self.max_cell_length and self.enable_cell_expansion:
            # Truncate before escaping so an entity such as "&amp;" is never
            # cut in half.
            short_text = self._escape_text(raw_text[: self.max_cell_length])
            return (
                f"<td style='{self._CELL_STYLE}'>"
                f"<div class='expandable-container'>"
                f"<span class='expandable' id='{table_uuid}-min-text-{row_number}-{col_idx}'>"
                f"{short_text}</span>"
                f"<span class='full-text' id='{table_uuid}-full-text-{row_number}-{col_idx}'>"
                f"{full_text}</span>"
                f"<button class='expand-btn' "
                f"onclick=\"toggleDataFrameCellText('{table_uuid}',{row_number},{col_idx})\">"
                f"...</button>"
                f"</div>"
                f"</td>"
            )

        return f"<td style='{self._CELL_STYLE}'>{full_text}</td>"

    def _format_cell_value(self, column: Any, row_idx: int) -> str:
        """Format a cell value for display.

        Args:
            column: Arrow array
            row_idx: Row index

        Returns:
            ``str(column[row_idx])``, or an empty string when indexing raises
            ``IndexError`` or ``TypeError``. The result is NOT HTML-escaped.
        """
        # This is a simplified implementation for Python-side formatting
        # In practice, we'd want to handle different Arrow types appropriately
        try:
            return str(column[row_idx])
        except (IndexError, TypeError):
            return ""

    def _get_default_css(self) -> str:
        """Get default CSS styles for the HTML table."""
        return """
            .expandable-container {
                display: inline-block;
                max-width: 200px;
            }
            .expandable {
                white-space: nowrap;
                overflow: hidden;
                text-overflow: ellipsis;
                display: block;
            }
            .full-text {
                display: none;
                white-space: normal;
            }
            .expand-btn {
                cursor: pointer;
                color: blue;
                text-decoration: underline;
                border: none;
                background: none;
                font-size: inherit;
                display: block;
                margin-top: 5px;
            }
        """

    def _get_javascript(self) -> str:
        """Get JavaScript code for interactive elements.

        The toggle checks for ``!== "inline"`` rather than ``=== "none"``:
        the full-text span is hidden by the ``.full-text`` stylesheet rule,
        so its inline ``style.display`` starts out as an empty string and a
        ``=== "none"`` test would make the first click a no-op.
        """
        return """
            <script>
            function toggleDataFrameCellText(table_uuid, row, col) {
                var shortText = document.getElementById(table_uuid + "-min-text-" + row + "-" + col);
                var fullText = document.getElementById(table_uuid + "-full-text-" + row + "-" + col);
                var button = event.target;

                if (fullText.style.display !== "inline") {
                    shortText.style.display = "none";
                    fullText.style.display = "inline";
                    button.textContent = "(less)";
                } else {
                    shortText.style.display = "inline";
                    fullText.style.display = "none";
                    button.textContent = "...";
                }
            }
            </script>
        """
| 214 | + |
| 215 | + |
# Module-wide formatter instance handed out by get_formatter(); it starts
# with every option at its constructor default.
_default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter()
| 218 | + |
| 219 | + |
def get_formatter() -> DataFrameHtmlFormatter:
    """Return the formatter instance currently installed as the global default."""
    current = _default_formatter
    return current
| 223 | + |
| 224 | + |
def configure_formatter(**kwargs: Any) -> None:
    """Install a new global DataFrame HTML formatter.

    A fresh ``DataFrameHtmlFormatter`` is built from ``kwargs`` alone and
    replaces the previous global instance, so any option not passed here
    takes its constructor default rather than its previous value.

    Args:
        **kwargs: Formatter configuration parameters, forwarded unchanged to
            the ``DataFrameHtmlFormatter`` constructor
    """
    global _default_formatter
    replacement = DataFrameHtmlFormatter(**kwargs)
    _default_formatter = replacement
0 commit comments