Skip to content

Commit be689df

Browse files
committed
add flag and complete the video generation during eval
1 parent 07bb615 commit be689df

3 files changed

Lines changed: 21 additions & 15 deletions

File tree

src/maxdiffusion/configs/base_wan_14b.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,3 +298,4 @@ quantization_calibration_method: "absmax"
298298
# Eval model on per eval_every steps. -1 means don't eval.
299299
eval_every: -1
300300
eval_data_dir: ""
301+
enable_generate_video_for_eval: False # This will increase TPU memory usage.

src/maxdiffusion/generate_wan.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,24 +46,24 @@ def upload_video_to_gcs(output_dir: str, video_path: str):
4646
blob = bucket.blob(destination_blob_name)
4747

4848
# Upload the file
49-
print(f"Uploading {source_file_path} to {bucket_name}/{destination_blob_name}...")
49+
max_logging.log(f"Uploading {source_file_path} to {bucket_name}/{destination_blob_name}...")
5050
blob.upload_from_filename(source_file_path)
51-
print(f"Upload complete {source_file_path}.")
51+
max_logging.log(f"Upload complete {source_file_path}.")
5252

5353
except Exception as e:
54-
print(f"An error occurred: {e}")
54+
max_logging.log(f"An error occurred: {e}")
5555

5656
def delete_file(file_path: str):
5757
# Best practice: Check if the file exists before trying to delete it.
5858
if os.path.exists(file_path):
5959
try:
6060
os.remove(file_path)
61-
print(f"Successfully deleted file: {file_path}")
61+
max_logging.log(f"Successfully deleted file: {file_path}")
6262
except OSError as e:
6363
# This catches other issues like permission errors
64-
print(f"Error deleting file '{file_path}': {e}")
64+
max_logging.log(f"Error deleting file '{file_path}': {e}")
6565
else:
66-
print(f"The file '{file_path}' does not exist.")
66+
max_logging.log(f"The file '{file_path}' does not exist.")
6767

6868
jax.config.update("jax_use_shardy_partitioner", True)
6969

@@ -86,12 +86,14 @@ def inference_generate_video(config, pipeline, filename_prefix=""):
8686
guidance_scale=config.guidance_scale,
8787
)
8888

89-
print(f"video {filename_prefix}, compile time: {(time.perf_counter() - s0)}")
89+
max_logging.log(f"video {filename_prefix}, compile time: {(time.perf_counter() - s0)}")
9090
for i in range(len(videos)):
9191
video_path = f"{filename_prefix}wan_output_{config.seed}_{i}.mp4"
9292
export_to_video(videos[i], video_path, fps=config.fps)
93-
upload_video_to_gcs(config.output_dir, video_path)
94-
delete_file(f"./{video_path}")
93+
if config.output_dir.startswith("gs://"):
94+
upload_video_to_gcs(config.output_dir, video_path)
95+
# Delete local files to avoid storing too many videos
96+
delete_file(f"./{video_path}")
9597
return
9698

9799
def run(config, pipeline=None, filename_prefix=""):
@@ -128,7 +130,8 @@ def run(config, pipeline=None, filename_prefix=""):
128130
video_path = os.path.join(f"{filename_prefix}wan_output_{config.seed}_{i}.mp4")
129131
export_to_video(videos[i], video_path, fps=config.fps)
130132
saved_video_path.append(video_path)
131-
upload_video_to_gcs(config.output_dir, video_path)
133+
if config.output_dir.startswith("gs://"):
134+
upload_video_to_gcs(config.output_dir, video_path)
132135

133136
s0 = time.perf_counter()
134137
videos = pipeline(

src/maxdiffusion/trainers/wan_trainer.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,10 @@ def start_training(self):
152152
# Generate a sample before training to compare against generated sample after training.
153153
pretrained_video_path = generate_sample(self.config, pipeline, filename_prefix="pre-training-")
154154

155-
# save some memory.
156-
# del pipeline.vae
157-
# del pipeline.vae_cache
155+
if self.config.eval_every == -1 or (not self.config.enable_generate_video_for_eval):
156+
# save some memory.
157+
del pipeline.vae
158+
del pipeline.vae_cache
158159

159160
mesh = pipeline.mesh
160161
train_data_iterator = self.load_dataset(mesh, is_training=True)
@@ -250,10 +251,11 @@ def training_loop(self, pipeline, optimizer, learning_rate_scheduler, train_data
250251
train_utils.write_metrics(writer, local_metrics_file, running_gcs_metrics, train_metric, step, self.config)
251252

252253
if self.config.eval_every > 0 and (step + 1) % self.config.eval_every == 0:
254+
if self.config.enable_generate_video_for_eval:
255+
pipeline.transformer = nnx.merge(state.graphdef, state.params, state.rest_of_state)
256+
inference_generate_video(self.config, pipeline, filename_prefix=f"{step+1}-train_steps-")
253257
# Re-create the iterator each time you start evaluation to reset it
254258
# This assumes your data loading logic can be called to get a fresh iterator.
255-
pipeline.transformer = nnx.merge(state.graphdef, state.params, state.rest_of_state)
256-
inference_generate_video(self.config, pipeline, filename_prefix=f"{step+1}-train_steps-")
257259
eval_data_iterator = self.load_dataset(mesh, is_training=False)
258260
eval_rng = jax.random.key(self.config.seed + step)
259261
eval_metrics = []

0 commit comments

Comments
 (0)