@@ -173,25 +173,54 @@ def _tpu_flash_attention(
   value, _, _ = _reshape_data_for_flash(value, heads, block_sizes.block_kv_compute)

   axis_names = nn.logical_to_mesh_axes(flash_axis_names)
+  kv_axis_names = nn.logical_to_mesh_axes((BATCH, HEAD, None, D_KV))
+  flash_axis_names_splash_kernel: AxisNames = (HEAD, LENGTH)
+  axis_names_splash_kernel = nn.logical_to_mesh_axes(flash_axis_names_splash_kernel)
+  named_sharding = jax.sharding.NamedSharding(mesh, axis_names_splash_kernel)
+
+  cp_size = 8

   @functools.partial(
-      shard_map.shard_map,
-      mesh=mesh,
-      in_specs=(
-          axis_names,
-          axis_names,
-          axis_names,
-      ),
-      out_specs=axis_names,
-      check_rep=False,
+      jax.jit,
+      static_argnames=[
+          "multi_head_mask",
+          "shard_head_size"
+      ],
   )
-  def wrap_flash_attention(query, key, value):
-    masks = [splash_attention_mask.FullMask(_shape=(query.shape[2], query.shape[2])) for _ in range(query.shape[1])]
-    multi_head_mask = splash_attention_mask.MultiHeadMask(masks=masks)
+  def wrap_splash_kernel(multi_head_mask, shard_head_size=1):
     splash_kernel = splash_attention_kernel.make_splash_mha(
-        mask=multi_head_mask, head_shards=1, q_seq_shards=1, block_sizes=block_sizes
+        mask=multi_head_mask,
+        head_shards=shard_head_size,  # number of shards along the head axis
+        q_seq_shards=cp_size,
+        block_sizes=block_sizes,
     )
-    return jax.vmap(splash_kernel)(query, key, value)
+    return splash_kernel
+
+  shard_head_size = 1
+  mask = splash_attention_mask.FullMask(_shape=(query.shape[2], query.shape[2]))
+  mask &= splash_attention_mask.LocalMask(
+      shape=(query.shape[2], key.shape[2]),
+      window_size=(query.shape[2], query.shape[2]),
+      offset=0
+  )
+  multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * query.shape[1])
+  splash_kernel = wrap_splash_kernel(multi_head_mask, int(shard_head_size))
+  segment_axis_names_splash_kernel = splash_kernel.manual_sharding_spec(named_sharding)
+  @functools.partial(
+      shard_map.shard_map,
+      mesh=mesh,
+      in_specs=(
+          axis_names,
+          kv_axis_names,
+          kv_axis_names,
+          segment_axis_names_splash_kernel,
+      ),
+      out_specs=axis_names,
+      check_rep=False
+  )
+  def wrap_flash_attention(query, key, value, splash_kernel):
+    attention_output = jax.vmap(splash_kernel)(query, key, value)
+    return attention_output

   devices_in_data_fsdp = mesh.shape["data"] * mesh.shape["fsdp"]
   # This warning might show up when doing model eval for example, when calculating model flops
@@ -201,7 +230,7 @@ def wrap_flash_attention(query, key, value):
         "Warning, batch dimension should be shardable among the devices in data and fsdp"
         f" axis, batch dimension: {query.shape[0]}, devices_in_data_fsdp: {devices_in_data_fsdp}"
     )
-  x = wrap_flash_attention(query, key, value)
+  x = wrap_flash_attention(query, key, value, splash_kernel)
   x = x[:, :, :query_seq_len, :kv_size]
   x = _reshape_heads_to_head_dim(x)

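For readers following the change: the commit splits kernel construction out of the sharded attention call. A MultiHeadMask is built once from a FullMask ANDed with a LocalMask, wrap_splash_kernel turns it into a splash kernel sharded over cp_size query-sequence shards, and manual_sharding_spec produces the in_spec that lets the kernel object be passed through shard_map. The snippet below is a minimal single-device sketch of the same mask and kernel construction, outside any mesh or shard_map; the shapes, dtype, and the BlockSizes.get_default() call are illustrative assumptions, not values taken from this commit (if that helper differs in your JAX version, pass an explicit BlockSizes).

# Minimal sketch: build the same kind of mask and splash kernel on one TPU
# device, without mesh/shard_map. Shapes, dtype, and the default BlockSizes
# below are assumptions for illustration only.
import jax
import jax.numpy as jnp
from jax.experimental.pallas.ops.tpu.splash_attention import (
    splash_attention_kernel,
    splash_attention_mask,
)

batch, heads, seq_len, head_dim = 1, 8, 1024, 128

# Full mask ANDed with a local window spanning the whole sequence, so the
# combined mask stays dense; replicate it per head, as the commit does.
mask = splash_attention_mask.FullMask(_shape=(seq_len, seq_len))
mask &= splash_attention_mask.LocalMask(
    shape=(seq_len, seq_len),
    window_size=(seq_len, seq_len),
    offset=0,
)
multi_head_mask = splash_attention_mask.MultiHeadMask(masks=(mask,) * heads)

# head_shards / q_seq_shards describe how the kernel is partitioned across
# devices; 1 / 1 corresponds to an unsharded, single-device run.
splash_kernel = splash_attention_kernel.make_splash_mha(
    mask=multi_head_mask,
    head_shards=1,
    q_seq_shards=1,
    block_sizes=splash_attention_kernel.BlockSizes.get_default(),
)

q = jnp.zeros((batch, heads, seq_len, head_dim), dtype=jnp.float32)
k = jnp.zeros((batch, heads, seq_len, head_dim), dtype=jnp.float32)
v = jnp.zeros((batch, heads, seq_len, head_dim), dtype=jnp.float32)

# The kernel operates on [heads, seq, head_dim]; vmap adds the batch axis,
# mirroring jax.vmap(splash_kernel)(query, key, value) in the commit.
out = jax.vmap(splash_kernel)(q, k, v)  # (batch, heads, seq_len, head_dim)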