Skip to content

Commit 5503f9c

Browse files
committed
add hyper
1 parent 9edefc3 commit 5503f9c

3 files changed

Lines changed: 10 additions & 6 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ global_batch_size: 0
235235
tfrecords_dir: ''
236236
no_records_per_shard: 0
237237
enable_eval_timesteps: False
238+
considered_timesteps_list: [125, 250, 375, 500, 625, 750, 875]
239+
num_eval_samples: 420
238240

239241
warmup_steps_fraction: 0.1
240242
learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.
@@ -316,3 +318,4 @@ quantization_calibration_method: "absmax"
316318
eval_every: -1
317319
eval_data_dir: ""
318320
enable_generate_video_for_eval: False # This will increase the used TPU memory.
321+
eval_max_number_of_samples_in_bucket: 60

src/maxdiffusion/data_preprocessing/wan_pusav1_to_tfrecords.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,10 @@ def generate_dataset(config):
8585
shard_record_count = 0
8686

8787
# Define timesteps and bucket configuration
88-
timesteps_list = [125, 250, 375, 500, 625, 750, 875]
89-
bucket_size = 60
90-
num_samples_to_process = 420
88+
num_eval_samples = config.num_eval_samples
89+
timesteps_list = config.considered_timesteps_list
90+
assert num_eval_samples % len(timesteps_list) == 0
91+
bucket_size = num_eval_samples // len(timesteps_list)
9192

9293
# Load dataset
9394
metadata_path = os.path.join(config.train_data_dir, "metadata.csv")
@@ -115,12 +116,12 @@ def generate_dataset(config):
115116
current_timestep = None
116117
# Determine the timestep for the first num_eval_samples samples
117118
if config.enable_eval_timesteps:
118-
if global_record_count < num_samples_to_process:
119+
if global_record_count < num_eval_samples:
119120
print(f"global_record_count: {global_record_count}")
120121
bucket_index = global_record_count // bucket_size
121122
current_timestep = timesteps_list[bucket_index]
122123
else:
123-
print(f"value {global_record_count} is greater than or equal to {num_samples_to_process}")
124+
print(f"value {global_record_count} is greater than or equal to {num_eval_samples}")
124125
return
125126

126127
# Write the example, including the timestep if applicable

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
345345
max_logging.log(f"Step {step}, calculating mean loss per timestep...")
346346
for timestep, losses in sorted(eval_losses_by_timestep.items()):
347347
losses = jnp.array(losses)
348-
losses = losses[: min(60, len(losses))]
348+
losses = losses[: min(self.config.eval_max_number_of_samples_in_bucket, len(losses))]
349349
mean_loss = jnp.mean(losses)
350350
max_logging.log(f" Mean eval loss for timestep {timestep}: {mean_loss:.4f}, num of losses: {len(losses)}")
351351
mean_per_timestep.append(mean_loss)

0 commit comments

Comments
 (0)