 import tensorflow as tf
 import jax.numpy as jnp
 import jax
-from jax.sharding import PartitionSpec as P
+from jax.sharding import Mesh, PartitionSpec as P
 from flax import nnx
 from maxdiffusion.schedulers import FlaxFlowMatchScheduler
 from flax.linen import partitioning as nn_partitioning
 from flax.training import train_state
 from maxdiffusion.pipelines.wan.wan_pipeline import WanPipeline
 from jax.experimental import multihost_utils
+from maxdiffusion.max_utils import create_device_mesh
+import copy
 
+class EvalConfig:
+  pass
 
 class TrainState(train_state.TrainState):
   graphdef: nnx.GraphDef
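Note: `EvalConfig` is intentionally an empty class. It acts as a plain attribute container that duck-types the training config, so `create_device_mesh(eval_config)` can read the same parallelism fields from it (see the `training_loop` hunk below). A minimal sketch of the same pattern using only the standard library; the field values are illustrative:

```python
from types import SimpleNamespace

# Any object exposing the fields that create_device_mesh reads will do,
# since attribute access is duck-typed; SimpleNamespace is the stdlib
# equivalent of an empty class used as an attribute bag.
eval_config = SimpleNamespace(
    ici_data_parallelism=4,    # illustrative: shard the eval batch 4 ways
    ici_fsdp_parallelism=1,    # no FSDP sharding during eval
    ici_tensor_parallelism=1,  # no tensor parallelism during eval
)
```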
@@ -212,7 +216,7 @@ def start_training(self):
 
     pipeline = self.load_checkpoint()
     # Generate a sample before training to compare against the sample generated after training.
-    pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
+    # pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
 
     if self.config.eval_every == -1 or (not self.config.enable_generate_video_for_eval):
       # save some memory.
@@ -230,8 +234,8 @@ def start_training(self):
     # Returns pipeline with trained transformer state
     pipeline = self.training_loop(pipeline, optimizer, learning_rate_scheduler, train_data_iterator)
 
-    posttrained_video_path = generate_sample(self.config, pipeline, filename_prefix="post-training-")
-    print_ssim(pretrained_video_path, posttrained_video_path)
+    # posttrained_video_path = generate_sample(self.config, pipeline, filename_prefix="post-training-")
+    # print_ssim(pretrained_video_path, posttrained_video_path)
 
   def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data_iterator):
     mesh = pipeline.mesh
@@ -246,7 +250,19 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
     state = jax.lax.with_sharding_constraint(state, state_spec)
     state_shardings = nnx.get_named_sharding(state, mesh)
     data_shardings = self.get_data_shardings(mesh)
-    eval_data_shardings = self.get_eval_data_shardings(mesh)
+
+    single_batch_size = min(self.config.eval_max_processed_batch_size, self.config.global_batch_size_to_train_on)
+    eval_config = EvalConfig()
+    eval_config.dcn_data_parallelism = self.config.dcn_data_parallelism
+    eval_config.dcn_fsdp_parallelism = self.config.dcn_fsdp_parallelism
+    eval_config.dcn_tensor_parallelism = self.config.dcn_tensor_parallelism
+    eval_config.ici_data_parallelism = single_batch_size
+    eval_config.ici_fsdp_parallelism = 1
+    eval_config.ici_tensor_parallelism = 1
+    eval_config.allow_split_physical_axes = self.config.allow_split_physical_axes
+    eval_devices_array = create_device_mesh(eval_config)
+    eval_mesh = Mesh(eval_devices_array, self.config.mesh_axes)
+    eval_data_shardings = self.get_eval_data_shardings(eval_mesh)
 
     writer = max_utils.initialize_summary_writer(self.config)
     writer_thread = threading.Thread(target=_tensorboard_writer_worker, args=(writer, self.config), daemon=True)
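The hunk above builds a second, purely data-parallel mesh for evaluation: `ici_data_parallelism` is set to the eval chunk size while the FSDP and tensor axes collapse to 1, so each eval chunk splits along the batch dimension only. A self-contained sketch of the underlying mechanism; the axis names here are assumptions, not necessarily what `self.config.mesh_axes` contains:

```python
import numpy as np
import jax
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Lay all devices along a single "data" axis; the other two mesh axes
# have size 1, mirroring ici_fsdp/ici_tensor parallelism of 1.
devices = np.asarray(jax.devices()).reshape(-1, 1, 1)
eval_mesh = Mesh(devices, ("data", "fsdp", "tensor"))

# Shard the leading (batch) dimension across "data"; replicate the rest.
batch_sharding = NamedSharding(eval_mesh, P("data"))
batch = jax.device_put(np.zeros((8, 128), np.float32), batch_sharding)
```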
@@ -327,25 +343,39 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
         # Loop indefinitely until the iterator is exhausted
         while True:
           try:
-            with mesh:
-              eval_start_time = datetime.datetime.now()
-              eval_batch = load_next_batch(eval_data_iterator, None, self.config)
-              metrics, eval_rng = p_eval_step(state, eval_batch, eval_rng, scheduler_state)
-              losses = metrics["scalar"]["learning/eval_loss"]
-              timesteps = eval_batch["timesteps"]
-              gathered_timesteps_on_device = multihost_utils.process_allgather(timesteps)
-              gathered_timesteps = jax.device_get(gathered_timesteps_on_device)
-              gathered_losses_on_device = multihost_utils.process_allgather(losses)
-              gathered_losses = jax.device_get(gathered_losses_on_device)
-              for t, l in zip(gathered_timesteps.flatten(), gathered_losses.flatten()):
+            eval_start_time = datetime.datetime.now()
+            eval_batch = load_next_batch(eval_data_iterator, None, self.config)
+            bs = len(eval_batch["latents"])
+            for i in range(0, bs, single_batch_size):
+              eval_step_start_time = datetime.datetime.now()
+              start = i
+              end = min(i + single_batch_size, bs)
+              timesteps = eval_batch["timesteps"][start:end]
+              chunk_eval_batch = {
+                  "latents": eval_batch["latents"][start:end, :],
+                  "encoder_hidden_states": eval_batch["encoder_hidden_states"][start:end, :],
+                  "timesteps": timesteps,
+              }
+              with eval_mesh:
+                metrics, eval_rng = p_eval_step(state, chunk_eval_batch, eval_rng, scheduler_state)
+              losses = metrics["scalar"]["learning/eval_loss"]
+              # gathered_timesteps_on_device = multihost_utils.process_allgather(timesteps)
+              # gathered_timesteps = jax.device_get(gathered_timesteps_on_device)
+              gathered_losses_on_device = multihost_utils.process_allgather(losses)
+              gathered_losses = jax.device_get(gathered_losses_on_device)
+              for t, l in zip(timesteps.flatten(), gathered_losses.flatten()):
                 timestep = int(t)
                 if timestep not in eval_losses_by_timestep:
                   eval_losses_by_timestep[timestep] = []
                 eval_losses_by_timestep[timestep].append(l)
+              eval_step_end_time = datetime.datetime.now()
+              eval_step_duration = eval_step_end_time - eval_step_start_time
+              if jax.process_index() == 0:
+                max_logging.log(f"  Eval step processed batch slice {start}:{end} in {eval_step_duration.total_seconds():.2f} seconds.")
             eval_end_time = datetime.datetime.now()
             eval_duration = eval_end_time - eval_start_time
             if jax.process_index() == 0:
-              max_logging.log(f"  Eval step time {eval_duration.total_seconds():.2f} seconds.")
+              max_logging.log(f"  Eval total time {eval_duration.total_seconds():.2f} seconds.")
           except StopIteration:
             # This block is executed when the iterator has no more data
             break
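The chunked loop above replaces one large `p_eval_step` call with several fixed-size calls so peak memory stays bounded. A hedged sketch of the slicing pattern as a reusable helper (the helper name is illustrative, not from the diff); note that when `bs` is not a multiple of the chunk size, the final shorter chunk has a new shape and will trigger one extra jit compilation of the eval step:

```python
# Illustrative helper: split a dict-of-arrays batch into fixed-size
# slices along the leading (batch) dimension; the last chunk may be short.
def iter_chunks(batch, chunk_size):
  batch_size = len(batch["latents"])
  for start in range(0, batch_size, chunk_size):
    end = min(start + chunk_size, batch_size)
    yield {key: value[start:end] for key, value in batch.items()}

# Usage, mirroring the loop above:
# for chunk in iter_chunks(eval_batch, single_batch_size):
#   metrics, eval_rng = p_eval_step(state, chunk, eval_rng, scheduler_state)
```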
@@ -440,11 +470,14 @@ def eval_step(state, data, rng, scheduler_state, scheduler, config):
 
   # The loss function logic is identical to training. We are evaluating the model's
   # ability to perform its core training objective (e.g., denoising).
-  @jax.jit
-  def loss_fn(params, latents, encoder_hidden_states, timesteps, rng):
+  def loss_fn(params):
     # Reconstruct the model from its definition and parameters
     model = nnx.merge(state.graphdef, params, state.rest_of_state)
 
+    latents = data["latents"].astype(config.weights_dtype)
+    encoder_hidden_states = data["encoder_hidden_states"].astype(config.weights_dtype)
+    timesteps = data["timesteps"].astype("int64")
+
     noise = jax.random.normal(key=rng, shape=latents.shape, dtype=latents.dtype)
     noisy_latents = scheduler.add_noise(scheduler_state, latents, noise, timesteps)
 
@@ -468,18 +501,8 @@ def loss_fn(params, latents, encoder_hidden_states, timesteps, rng):
 
   # --- Key Difference from train_step ---
   # Directly compute the loss without calculating gradients.
   # The model's state.params are used but not updated.
-  bs = len(data["latents"])
-  single_batch_size = min(config.eval_max_processed_batch_size, config.global_batch_size_to_train_on)
-  losses = jnp.zeros(bs)
-  for i in range(0, bs, single_batch_size):
-    start = i
-    end = min(i + single_batch_size, bs)
-    latents = data["latents"][start:end, :].astype(config.weights_dtype)
-    encoder_hidden_states = data["encoder_hidden_states"][start:end, :].astype(config.weights_dtype)
-    timesteps = data["timesteps"][start:end].astype("int64")
-    _, new_rng = jax.random.split(rng, num=2)
-    loss = loss_fn(state.params, latents, encoder_hidden_states, timesteps, new_rng)
-    losses = losses.at[start:end].set(loss)
+  _, new_rng = jax.random.split(rng, num=2)
+  losses = loss_fn(state.params)
 
   # Structure the metrics for logging and aggregation
   metrics = {"scalar": {"learning/eval_loss": losses}}
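Because each `p_eval_step` call now sees exactly one chunk, `loss_fn` can return a per-sample loss vector directly and the old host-side `losses.at[start:end].set(...)` accumulation disappears. For illustration, a minimal per-sample denoising MSE of the kind that could populate `learning/eval_loss`; the reduction shown is an assumption, since the true target comes from the flow-match scheduler:

```python
import jax.numpy as jnp

def per_sample_mse(prediction, target):
  # Average the squared error over every axis except the leading batch
  # axis, producing one scalar loss per sample: shape (batch,).
  reduce_axes = tuple(range(1, prediction.ndim))
  return jnp.mean((prediction - target) ** 2, axis=reduce_axes)
```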