|
26 | 26 | import PIL.Image |
27 | 27 | import PIL.ImageOps |
28 | 28 |
|
29 | | -from .import_utils import BACKENDS_MAPPING, is_imageio_available, is_opencv_available |
| 29 | +from .import_utils import AV_IMPORT_ERROR, BACKENDS_MAPPING, is_av_available, is_imageio_available, is_opencv_available |
30 | 30 | from .logging import get_logger |
31 | 31 |
|
| 32 | +if is_av_available(): |
| 33 | + import av |
| 34 | + |
32 | 35 |
|
33 | 36 | global_rng = random.Random() |
34 | 37 |
|
@@ -222,3 +225,129 @@ def export_to_video( |
222 | 225 | writer.append_data(frame) |
223 | 226 |
|
224 | 227 | return output_video_path |
| 228 | + |
| 229 | + |
def _prepare_audio_stream(container, audio_sample_rate: int):
    """
    Add a stereo AAC audio stream to an open PyAV output container.

    Args:
        container: Open PyAV output container (write mode).
        audio_sample_rate: Sample rate in Hz for the encoded audio.

    Returns:
        The newly created audio stream, configured for stereo AAC with a
        1/sample_rate time base (one tick per sample).
    """
    from fractions import Fraction

    stream = container.add_stream("aac", rate=audio_sample_rate)
    codec_ctx = stream.codec_context
    codec_ctx.sample_rate = audio_sample_rate
    codec_ctx.layout = "stereo"
    # A 1/rate time base lets frame PTS values be expressed directly in samples.
    codec_ctx.time_base = Fraction(1, audio_sample_rate)
    return stream
| 240 | + |
| 241 | + |
def _resample_audio(container, audio_stream, frame_in) -> None:
    """
    Resample an input audio frame to the stream's codec configuration,
    encode it, and mux the resulting packets into the container.

    Args:
        container: Open PyAV output container.
        audio_stream: Audio stream created by `_prepare_audio_stream`.
        frame_in: `av.AudioFrame` holding the full audio clip.
    """
    cc = audio_stream.codec_context

    # Fall back to sensible defaults when the codec context is not fully
    # configured yet (AAC typically uses planar float, "fltp").
    target_format = cc.format or "fltp"
    target_layout = cc.layout or "stereo"
    target_rate = cc.sample_rate or frame_in.sample_rate

    audio_resampler = av.audio.resampler.AudioResampler(
        format=target_format,
        layout=target_layout,
        rate=target_rate,
    )

    audio_next_pts = 0
    # Feed the frame, then flush the resampler with None so any samples it
    # buffered internally are emitted rather than silently dropped.
    for source in (frame_in, None):
        for rframe in audio_resampler.resample(source):
            if rframe.pts is None:
                rframe.pts = audio_next_pts
            audio_next_pts += rframe.samples
            # Tag frames with the *output* rate. (Using the input rate here
            # would mislabel frames whenever input and target rates differ.)
            rframe.sample_rate = target_rate
            container.mux(audio_stream.encode(rframe))

    # Flush the audio encoder.
    for packet in audio_stream.encode():
        container.mux(packet)
| 266 | + |
| 267 | + |
def _write_audio(
    container,
    audio_stream,
    samples: Any,
    audio_sample_rate: int,
) -> None:
    """
    Convert raw audio samples to a packed s16 stereo frame and encode it.

    Args:
        container: Open PyAV output container.
        audio_stream: Audio stream created by `_prepare_audio_stream`.
        samples: Array-like audio. Accepted layouts: 1-D mono ``[L]``
            (duplicated to both channels), ``[L, 2]``, or ``[2, L]``.
            Float input is assumed to lie in [-1, 1]; int16 is passed through.
        audio_sample_rate: Sample rate (Hz) of `samples`.

    Raises:
        ValueError: If `samples` cannot be interpreted as 2-channel audio.
    """
    import numpy as np

    samples = np.asarray(samples)

    # Promote mono to stereo by duplicating the channel. (Expanding to a
    # single-channel column, as before, always failed the 2-channel check
    # below, so 1-D input could never be written.)
    if samples.ndim == 1:
        samples = np.stack([samples, samples], axis=1)

    # The Vocoder naturally outputs (Channels=2, Time); transpose to (Time, 2).
    if samples.ndim == 2 and samples.shape[0] == 2 and samples.shape[1] != 2:
        samples = samples.T

    if samples.ndim != 2 or samples.shape[1] != 2:
        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")

    if samples.dtype != np.int16:
        # Float audio in [-1, 1] -> signed 16-bit PCM.
        samples = np.clip(samples, -1.0, 1.0)
        samples = (samples * 32767.0).astype(np.int16)

    # Packed "s16" expects a single plane of interleaved samples: L, R, L, R...
    # A C-contiguous (Time, 2) array flattens to exactly that order.
    samples_np = np.ascontiguousarray(samples).reshape(1, -1)

    frame_in = av.AudioFrame.from_ndarray(
        samples_np,
        format="s16",
        layout="stereo",
    )
    frame_in.sample_rate = audio_sample_rate

    _resample_audio(container, audio_stream, frame_in)
| 301 | + |
| 302 | + |
def export_to_video_with_audio(video: Any, fps: int, audio: Optional[Any], audio_sample_rate: Optional[int], output_path: str) -> None:
    """
    Encodes video (and optionally audio) to a file using PyAV.

    Args:
        video: Video array-like [F, H, W, C] (frames, height, width, channels),
            or [B, F, H, W, C] (only the first batch element is written).
        fps: Frames per second.
        audio: Audio array-like [C, L] or [L, C], or None for a silent video.
        audio_sample_rate: Audio sample rate in Hz; required when `audio` is given.
        output_path: Output file path.

    Raises:
        ImportError: If PyAV is not installed.
        ValueError: If `video` is not 4D/5D, or `audio` is given without a rate.
    """
    if not is_av_available():
        raise ImportError(AV_IMPORT_ERROR.format("export_to_video_with_audio"))

    # Validate arguments *before* opening the output file so a bad call does
    # not leave an empty/truncated file behind.
    if audio is not None and audio_sample_rate is None:
        raise ValueError("audio_sample_rate is required when audio is provided")

    video_np = np.asarray(video)
    if video_np.ndim == 5:
        # [B, F, H, W, C] -> take the first video in the batch.
        video_np = video_np[0]
    if video_np.ndim != 4:
        raise ValueError(f"export_to_video_with_audio expects a 4D or 5D video tensor, got {video_np.ndim}D")
    _, height, width, _ = video_np.shape

    container = av.open(output_path, mode="w")
    try:
        stream = container.add_stream("libx264", rate=int(fps))
        stream.width = width
        stream.height = height
        stream.pix_fmt = "yuv420p"

        audio_stream = None
        if audio is not None:
            audio_stream = _prepare_audio_stream(container, audio_sample_rate)

        for frame_array in video_np:
            # frame_array is [H, W, C]; from_ndarray requires contiguous memory.
            frame = av.VideoFrame.from_ndarray(np.ascontiguousarray(frame_array), format="rgb24")
            for packet in stream.encode(frame):
                container.mux(packet)

        # Flush the video encoder.
        for packet in stream.encode():
            container.mux(packet)

        if audio is not None:
            _write_audio(container, audio_stream, audio, audio_sample_rate)
    finally:
        # Always close so the container is finalized (or at least released)
        # even when encoding raises part-way through.
        container.close()
0 commit comments