# Define a filename for logging
def _log_to_file(message: str, log_file: str = ""):
  """Appends a timestamped message to the given log file and the standard logger.

  Args:
    message: Text to record.
    log_file: Path of the file to append to. When empty, the message is only
      forwarded to max_logging.
  """
  timestamp = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())
  full_message = f"[{timestamp}] {message}\n"
  if log_file:
    # Explicit utf-8 avoids platform-dependent default encodings when appending.
    with open(log_file, "a", encoding="utf-8") as f:
      f.write(full_message)
  max_logging.log(full_message.strip())
3636
3737
3838class BaseStableDiffusionTrainer (BaseStableDiffusionCheckpointer ):
@@ -80,32 +80,28 @@ def get_data_shardings(self):
  @abstractmethod
  def create_scheduler(self, pipeline, params):
    """Creates the noise scheduler state for training; must be implemented by subclasses."""
    pass
83-
83+
8484 def _time_and_log_call (
85- self ,
86- func_obj : Callable [..., Any ],
87- * func_args : Any ,
88- description : str = "" ,
89- ** func_kwargs : Any
90- ) -> Any :
85+ self , func_obj : Callable [..., Any ], * func_args : Any , description : str = "" , ** func_kwargs : Any
86+ ) -> Any :
9187 """
9288 Times a function call, logs its duration, and returns its result.
9389 """
9490 if not description :
95- if hasattr (func_obj , ' __name__' ):
91+ if hasattr (func_obj , " __name__" ):
9692 description = func_obj .__name__
97- elif hasattr (func_obj , ' __call__' ) and hasattr (type (func_obj ), ' __name__' ):
93+ elif hasattr (func_obj , " __call__" ) and hasattr (type (func_obj ), " __name__" ):
9894 description = type (func_obj ).__name__
9995 log_file = ""
100-
96+
10197 if self .config .write_timing_metrics and self .config .timing_metrics_file :
10298 log_file = self .config .get .timing_metrics_file
10399 _log_to_file (f"Starting: { description } ..." , log_file = log_file )
104- start_time = time .perf_counter () # Use perf_counter for more precise duration measurement
100+ start_time = time .perf_counter () # Use perf_counter for more precise duration measurement
105101 result = func_obj (* func_args , ** func_kwargs )
106102 end_time = time .perf_counter ()
107103 duration = end_time - start_time
108- _log_to_file (f"Finished: { description } - Duration: { duration :.4f} seconds" ,log_file = log_file )
104+ _log_to_file (f"Finished: { description } - Duration: { duration :.4f} seconds" , log_file = log_file )
109105 return result
110106
111107 def calculate_tflops (self , pipeline , params ):
@@ -129,7 +125,7 @@ def start_training(self):
129125 pipeline = pipeline ,
130126 params = params ,
131127 checkpoint_item_name = "vae_state" ,
132- is_training = False
128+ is_training = False ,
133129 )
134130
135131 train_states ["vae_state" ] = vae_state
@@ -147,13 +143,13 @@ def start_training(self):
147143 state_shardings ["text_encoder_state_shardings" ] = text_encoder_state_mesh_shardings
148144 if hasattr (pipeline , "text_encoder_2" ):
149145 text_encoder_2_state , text_encoder_2_state_mesh_shardings = self ._time_and_log_call (
150- self .create_text_encoder_2_state ,
151- # Arguments for create_text_encoder_2_state
152- pipeline = pipeline ,
153- params = params ,
154- checkpoint_item_name = "text_encoder_2_state" ,
155- is_training = self .config .train_text_encoder ,
156- )
146+ self .create_text_encoder_2_state ,
147+ # Arguments for create_text_encoder_2_state
148+ pipeline = pipeline ,
149+ params = params ,
150+ checkpoint_item_name = "text_encoder_2_state" ,
151+ is_training = self .config .train_text_encoder ,
152+ )
157153 train_states ["text_encoder_2_state" ] = text_encoder_2_state
158154 state_shardings ["text_encoder_2_state_shardings" ] = text_encoder_2_state_mesh_shardings
159155
@@ -167,17 +163,9 @@ def start_training(self):
167163 self .per_device_tflops = per_device_tflops
168164
169165 # Load dataset
170- data_iterator = self ._time_and_log_call (
171- self .load_dataset ,
172- pipeline ,
173- params ,
174- train_states
175- )
166+ data_iterator = self ._time_and_log_call (self .load_dataset , pipeline , params , train_states )
176167 if self .config .dataset_type == "grain" :
177- data_iterator = self ._time_and_log_call (
178- self .restore_data_iterator_state ,
179- data_iterator = data_iterator
180- )
168+ data_iterator = self ._time_and_log_call (self .restore_data_iterator_state , data_iterator = data_iterator )
181169
182170 unet_state , unet_state_mesh_shardings , unet_learning_rate_scheduler = self ._time_and_log_call (
183171 self .create_unet_state ,
@@ -196,13 +184,12 @@ def start_training(self):
196184 data_shardings = self .get_data_shardings ()
197185 # Compile train_step
198186 p_train_step = self ._time_and_log_call (
199- self .compile_train_step ,
200- pipeline , params , train_states , state_shardings , data_shardings
201- )
187+ self .compile_train_step , pipeline , params , train_states , state_shardings , data_shardings
188+ )
202189 # Start training
203- train_states = self ._time_and_log_call (self . training_loop ,
204- p_train_step , pipeline , params , train_states , data_iterator , unet_learning_rate_scheduler
190+ train_states = self ._time_and_log_call (
191+ self . training_loop , p_train_step , pipeline , params , train_states , data_iterator , unet_learning_rate_scheduler
205192 )
206193 # 6. save final checkpoint
207194 # Hook
208- self ._time_and_log_call (self .post_training_steps ,pipeline , params , train_states )
195+ self ._time_and_log_call (self .post_training_steps , pipeline , params , train_states )
0 commit comments