@@ -151,17 +151,17 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor']
151151logical_axis_rules : [
152152 ['batch', ['data', 'fsdp']],
153153 ['activation_batch', ['data', 'fsdp']],
154- ['activation_self_attn_heads', ['context', 'tensor']],
155- ['activation_cross_attn_q_length', ['context', 'tensor']],
156- ['activation_length', 'context'],
154+ ['activation_self_attn_heads', 'tensor'],
155+ ['activation_cross_attn_q_length', 'tensor'],
156+ ['activation_length', None],
157157 ['activation_heads', 'tensor'],
158158 ['mlp','tensor'],
159- ['embed', ['context', 'fsdp']],
159+ ['embed', 'fsdp'],
160160 ['heads', 'tensor'],
161161 ['norm', 'tensor'],
162- ['conv_batch', ['data', 'context', 'fsdp']],
162+ ['conv_batch', ['data', 'fsdp']],
163163 ['out_channels', 'tensor'],
164- ['conv_out', 'context'],
164+ ['conv_out', 'tensor'],
165165 ]
166166data_sharding : [['data', 'fsdp', 'context', 'tensor']]
167167
@@ -174,9 +174,9 @@ dcn_fsdp_parallelism: -1
174174dcn_context_parallelism : 1
175175dcn_tensor_parallelism : 1
176176ici_data_parallelism : 1
177- ici_fsdp_parallelism : -1 # recommended ICI axis to be auto-sharded
177+ ici_fsdp_parallelism : 1
178178ici_context_parallelism : 1
179- ici_tensor_parallelism : 1
179+ ici_tensor_parallelism : -1 # recommended ICI axis to be auto-sharded
180180
181181allow_split_physical_axes : False
182182
0 commit comments