@@ -44,17 +44,17 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
4444logical_axis_rules : [
4545 ['batch', ['data', 'fsdp']],
4646 ['activation_batch', ['data', 'fsdp']],
47- ['activation_self_attn_heads', ['context', ' tensor'] ],
48- ['activation_cross_attn_q_length', ['context', ' tensor'] ],
49- ['activation_length', 'context' ],
47+ ['activation_self_attn_heads', 'tensor'], # axis names must match mesh_axes exactly; quoted padding is significant
48+ ['activation_cross_attn_q_length', 'tensor'],
49+ ['activation_length', None ], # NOTE(review): bare None is a YAML string, not null - confirm the loader treats it as "unsharded"
5050 ['activation_heads', 'tensor'],
5151 ['mlp','tensor'],
52- ['embed', ['context ', 'fsdp ']],
52+ ['embed', ['fsdp', 'tensor']],
5353 ['heads', 'tensor'],
5454 ['norm', 'tensor'],
55- ['conv_batch', ['data', 'context', ' fsdp']],
55+ ['conv_batch', ['data', 'fsdp']],
5656 ['out_channels', 'tensor'],
57- ['conv_out', 'context '],
57+ ['conv_out', 'tensor'],
5858 ]
5959data_sharding : ['data', 'fsdp', 'context', 'tensor']
6060
@@ -75,8 +75,8 @@ dcn_context_parallelism: 1
7575dcn_tensor_parallelism : 1
7676ici_data_parallelism : 1
7777ici_fsdp_parallelism : 1
78- ici_context_parallelism : -1 # recommended ICI axis to be auto-sharded
79- ici_tensor_parallelism : 1
78+ ici_context_parallelism : 1 # fixed at 1; ici_tensor_parallelism below carries the auto-sharded -1
79+ ici_tensor_parallelism : -1 # recommended ICI axis to be auto-sharded
8080enable_profiler : False
8181
8282replicate_vae : False
0 commit comments