Skip to content

Commit 8f1ffde

Browse files
committed
Add debug printing - will revert
1 parent 5d9d758 commit 8f1ffde

7 files changed

Lines changed: 161 additions & 1 deletion

File tree

sh_scripts/128_intoverflow.sh

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
#!/bin/bash
# Launch a WAN 14B debug training run on a tpu7x-128 xpk cluster.
#
# Image build/push (run once beforehand):
# bash docker_build_dependency_image.sh
# docker tag maxdiffusion_base_image:latest gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest
# docker push gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest

CLUSTER_NAME=bodaborg-tpu7x-128
DEVICE_TYPE=tpu7x-128 # can change to any size <= tpu7x-256
PROJECT=cloud-tpu-multipod-dev
ZONE=us-central1

# Please change the RUN_NAME and OUTPUT_DIR to your own GCS bucket path.
export RUN_NAME=wan-v7x-128-incre-bring-back-v0

USR_NAME=elisatsai
YOUR_GCS_BUCKET=gs://${USR_NAME}-wan-maxdiffusion

OUTPUT_DIR=${YOUR_GCS_BUCKET}/wan/${RUN_NAME}

# using sanbao's dir
DATASET_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/train/
EVAL_DATA_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/eval_timesteps/
SAVE_DATASET_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/save/

# Assigning to RANDOM seeds bash's PRNG, so seed=$RANDOM below is
# deterministic across runs of this script.
RANDOM=123456789
IMAGE_DIR=gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest
LIBTPU_VERSION=libtpu-0.0.25.dev20251013+tpu7x-cp312-cp312-manylinux_2_31_x86_64.whl

# FIX: HF_TOKEN placeholder is now single-quoted. Unquoted, the remote shell
# parses `<your_token_here>` as input/output redirections and the command
# fails with a syntax error at `&&` before training ever starts.
xpk workload create \
  --cluster=$CLUSTER_NAME \
  --project=$PROJECT \
  --zone=$ZONE \
  --device-type=$DEVICE_TYPE \
  --num-slices=1 \
  --command=" \
  pip install . && \
  gsutil cp gs://libtpu-tpu7x-releases/wheels/libtpu/${LIBTPU_VERSION} . && \
  python -m pip install ${LIBTPU_VERSION} && \
  pip install tokamax && \
  export XLA_FLAGS='--xla_dump_to=/tmp/xla_dumps --xla_dump_hlo_as_text --xla_dump_hlo_as_proto' && \
  export LIBTPU_INIT_ARGS='--xla_enable_async_all_gather=true \
  --xla_tpu_enable_async_collective_fusion=true \
  --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true \
  --xla_enable_async_all_reduce=true \
  --xla_tpu_enable_sparse_core_collective_offload_all_reduce=true \
  --xla_max_concurrent_async_all_gathers=4 \
  --xla_tpu_enable_async_all_to_all=true \
  --xla_latency_hiding_scheduler_rerun=5 \
  --xla_tpu_rwb_fusion=false \
  --xla_tpu_enable_sublane_major_scaling_bitcast_fusion=false \
  --xla_tpu_impure_enable_packed_bf16_math_ops=false \
  --xla_tpu_enable_sparse_core_reduce_scatter_v2=true \
  --xla_tpu_enable_sparse_core_collective_offload_all_gather=true \
  --xla_tpu_enable_sparse_core_collective_offload_2d_all_gather=true \
  --xla_tpu_enable_all_gather_offload_tracing=true \
  --xla_tpu_use_tc_device_shape_on_sc=true \
  --xla_tpu_prefer_async_allgather_to_allreduce=true \
  --xla_tpu_enable_sparse_core_collective_offload_reduce_scatter=true \
  --xla_tpu_scoped_vmem_limit_kib=65536 \
  --xla_tpu_enable_tpu_custom_call_scoped_vmem_adjustments=true \
  --xla_enable_transpose_trace=false' && \
  export HF_TOKEN='<your_token_here>' && \
  echo 'Starting WAN training ...' && \
  HF_HUB_CACHE=/dev/shm python src/maxdiffusion/train_wan.py \
  src/maxdiffusion/configs/base_wan_14b.yml \
  attention='flash' \
  weights_dtype=bfloat16 \
  activations_dtype=bfloat16 \
  guidance_scale=5.0 \
  flow_shift=5.0 \
  fps=16 \
  skip_jax_distributed_system=False \
  run_name='test-wan-training-new' \
  output_dir=${OUTPUT_DIR} \
  train_data_dir=${DATASET_DIR} \
  load_tfrecord_cached=True \
  height=1280 \
  width=720 \
  num_frames=81 \
  num_inference_steps=50 \
  prompt='a japanese pop star young woman with black hair is singing with a smile. She is inside a studio with dim lighting and musical instruments.' \
  jax_cache_dir=${OUTPUT_DIR}/jax_cache/ \
  enable_profiler=True \
  dataset_save_location=${SAVE_DATASET_DIR} \
  remat_policy='HIDDEN_STATE_WITH_OFFLOAD' \
  flash_min_seq_length=0 \
  seed=$RANDOM \
  skip_first_n_steps_for_profiler=0 \
  profiler_steps=2 \
  per_device_batch_size=0.5 \
  ici_data_parallelism=64 \
  ici_fsdp_parallelism=2 \
  ici_tensor_parallelism=1 \
  allow_split_physical_axes=True \
  max_train_steps=2 \
  scan_layers=true \
  flash_block_sizes='{\"block_q\":2048,\"block_kv_compute\":512,\"block_kv\":2048,\"block_q_dkv\":2048,\"block_kv_dkv\":2048,\"block_kv_dkv_compute\":512,\"use_fused_bwd_kernel\":true}' || (echo 'Training failed, uploading HLO dumps...'; sleep 5; gsutil -m cp -r /tmp/xla_dumps ${OUTPUT_DIR}/hlo_dumps/ 2>&1); \
  " \
  --base-docker-image=${IMAGE_DIR} \
  --enable-debug-logs \
  --workload=${RUN_NAME} \
  --priority=medium \
  --max-restarts=0

sh_scripts/filter_by_job.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
# List xpk workloads on the cluster, filtered to the job named by $RUN_NAME.

# ANSI color codes for highlighting the echoed command.
BLUE='\033[0;34m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# Echo the exact command (in green) before running it, so the log shows
# precisely what was executed.
echo -e "${GREEN}xpk workload list --cluster=bodaborg-tpu7x-128 --project=cloud-tpu-multipod-dev --zone=us-central1 --filter-by-job=$RUN_NAME${NC}"
xpk workload list --cluster=bodaborg-tpu7x-128 --project=cloud-tpu-multipod-dev --zone=us-central1 --filter-by-job=$RUN_NAME

sh_scripts/first_pod_log.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
# Tail the logs of the first pod whose name matches $RUN_NAME.

# ANSI color codes for highlighting the echoed command.
BLUE='\033[0;34m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# Echo the exact command (in green) before running it; the inner pipeline
# picks the first matching pod name from `kubectl get pods`.
echo -e "${GREEN}kubectl logs $(kubectl get pods | grep $RUN_NAME | head -1 | awk '{print $1}') --all-containers --tail=200${NC}"
kubectl logs $(kubectl get pods | grep $RUN_NAME | head -1 | awk '{print $1}') --all-containers --tail=200

sh_scripts/get_pods.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
# Show the pods belonging to the workload named by $RUN_NAME.

# ANSI color codes for highlighting the echoed command.
BLUE='\033[0;34m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

# Echo the exact command (in green) before running it.
echo -e "${GREEN}kubectl get pods | grep $RUN_NAME${NC}"
kubectl get pods | grep $RUN_NAME

sh_scripts/linter_test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Lint helper; for the full suite run: bash unit_test_and_lint.sh
# Auto-fixes whatever ruff can fix in the current tree.
ruff check . --fix

src/maxdiffusion/max_utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -650,4 +650,30 @@ def maybe_initialize_jax_distributed_system(raw_keys):
650650
initialize_jax_for_gpu()
651651
max_logging.log("Jax distributed system initialized on GPU!")
652652
else:
653-
jax.distributed.initialize()
653+
jax.distributed.initialize()
654+
655+
656+
def get_tensor_sharding_info(tensor, name="tensor", loc=""):
  """Print tensor sharding info using jax.debug.inspect_array_sharding.

  Uses jax.debug.inspect_array_sharding, which reports sharding metadata
  without transferring the tensor data to host (avoiding OOM on large
  activations).

  Args:
    tensor: JAX array (or tracer inside jit) to inspect.
    name: Human-readable name for the tensor, included in the printed output.
    loc: Location tag (e.g., "MLP_INPUT", "FFN_OUTPUT") identifying where the
      sharding is checked.

  Returns:
    The tensor unchanged (for use in chaining).
  """
  # Fix: the original computed an unused ``loc_str`` and ignored ``name``
  # entirely even though it is documented as "for logging"; both are now
  # folded into a single printed prefix.
  prefix = f"[{loc}] {name}" if loc else name

  def _print_with_prefix(msg):
    # The callback only ever receives sharding metadata, never array data,
    # so printing here cannot OOM on large tensors.
    print(f"{prefix}: {msg}")

  jax.debug.inspect_array_sharding(tensor, callback=_print_with_prefix)
  return tensor

src/maxdiffusion/models/wan/transformers/transformer_wan.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from ...normalization_flax import FP32LayerNorm
3737
from ...attention_flax import FlaxWanAttention
3838
from ...gradient_checkpoint import GradientCheckpointType
39+
from ....max_utils import get_tensor_sharding_info
3940

4041
BlockSizes = common_types.BlockSizes
4142

@@ -187,6 +188,7 @@ def __init__(
187188

188189
def __call__(self, x: jax.Array) -> jax.Array:
    """Project the input, log the projection's sharding, then apply GELU."""
    projected = self.proj(x)
    # Debug-only: emit sharding metadata for the pre-GELU activation.
    get_tensor_sharding_info(projected, "ffn_activation_before_gelu", loc="GELU_PROJ_OUTPUT")
    return nnx.gelu(projected)
191193

192194

@@ -245,7 +247,9 @@ def conditional_named_scope(self, name: str):
245247

246248
def __call__(self, hidden_states: jax.Array, deterministic: bool = True, rngs: nnx.Rngs = None) -> jax.Array:
247249
with self.conditional_named_scope("mlp_up_proj_and_gelu"):
250+
get_tensor_sharding_info(hidden_states, "mlp_input", loc="MLP_INPUT")
248251
hidden_states = self.act_fn(hidden_states) # Output is (4, 75600, 13824)
252+
get_tensor_sharding_info(hidden_states, "mlp_intermediate", loc="MLP_INTERMEDIATE")
249253
hidden_states = checkpoint_name(hidden_states, "ffn_activation")
250254
hidden_states = self.drop_out(hidden_states, deterministic=deterministic, rngs=rngs)
251255
with self.conditional_named_scope("mlp_down_proj"):
@@ -359,6 +363,7 @@ def __call__(
359363
(self.adaln_scale_shift_table + temb.astype(jnp.float32)), 6, axis=1
360364
)
361365
hidden_states = jax.lax.with_sharding_constraint(hidden_states, PartitionSpec("data", "fsdp", "tensor"))
366+
get_tensor_sharding_info(hidden_states, "hidden_states_after_constraint", loc="BLOCK_ENTRY")
362367
hidden_states = checkpoint_name(hidden_states, "hidden_states")
363368
encoder_hidden_states = jax.lax.with_sharding_constraint(encoder_hidden_states, PartitionSpec("data", "fsdp", None))
364369

0 commit comments

Comments
 (0)