Skip to content

Commit ea55a0d

Browse files
committed
Gate TPU power profiling events behind profile_power_events flag
On GPU, the CUPTI tracer does not recognize the TPU-specific keys in advanced_configuration (tpu_power_trace_level, e2e_enable_fw_*_event). Passing them causes an INVALID_ARGUMENT error that aborts the GPU xplane trace and leaves only CPU traces behind. Introduce a profile_power_events flag (default False) that gates the TPU-specific advanced_configuration block. With the default, these keys are no longer set, so GPU xplane tracing is not broken by them; TPU users who want power/thermal tracing must now opt in with profile_power_events=True.
1 parent b5f41ec commit ea55a0d

3 files changed

Lines changed: 3 additions & 1 deletion

File tree

src/maxtext/common/profiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __init__(self, config, offset_step=0):
5151
ManagedMLDiagnostics(config) # Initialize the MLRun instance.
5252

5353
self.profiling_options = jax.profiler.ProfileOptions()
54-
if self.mode == "xplane" and not self.managed_mldiagnostics:
54+
if self.mode == "xplane" and not self.managed_mldiagnostics and config.profile_power_events:
5555
self.profiling_options.advanced_configuration = {
5656
"tpu_power_trace_level": config.xprof_tpu_power_trace_level,
5757
"e2e_enable_fw_throttle_event": config.xprof_e2e_enable_fw_throttle_event,

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -911,6 +911,7 @@ xprof_tpu_power_trace_level: 0
911911
xprof_e2e_enable_fw_throttle_event: False
912912
xprof_e2e_enable_fw_power_level_event: False
913913
xprof_e2e_enable_fw_thermal_event: False
914+
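profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events; this gates passing the xprof_tpu_power_trace_level and xprof_e2e_enable_fw_*_event settings above to the xplane profiler. Defaults to False to avoid breaking GPU xplane tracing.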
914915

915916
log_config: True # Prints the config (after defaults have been set by pyconfig logic)
916917
debug_sharding: False # Prints model weights sharding info

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,7 @@ class Profiling(BaseModel):
13801380
xprof_e2e_enable_fw_throttle_event: bool = Field(False, description="Enable FW throttle event.")
13811381
xprof_e2e_enable_fw_power_level_event: bool = Field(False, description="Enable FW power level event.")
13821382
xprof_e2e_enable_fw_thermal_event: bool = Field(False, description="Enable FW thermal event.")
1383+
profile_power_events: bool = Field(False, description="Enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.")
13831384

13841385

13851386
class HloDump(BaseModel):

0 commit comments

Comments
 (0)