@@ -2527,77 +2527,77 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25272527
25282528 # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
25292529 # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2530- if self .using_pipeline_parallelism and self .mesh_axes and self .mesh_axes [0 ] == "stage" :
2531- self .ici_parallelism = [
2532- self .ici_diloco_parallelism ,
2533- self .ici_pipeline_parallelism ,
2534- self .ici_data_parallelism ,
2535- self .ici_fsdp_parallelism ,
2536- self .ici_fsdp_transpose_parallelism ,
2537- self .ici_sequence_parallelism ,
2538- self .ici_context_parallelism ,
2539- self .ici_context_autoregressive_parallelism ,
2540- self .ici_tensor_parallelism ,
2541- self .ici_tensor_transpose_parallelism ,
2542- self .ici_tensor_sequence_parallelism ,
2543- self .ici_expert_parallelism ,
2544- self .ici_autoregressive_parallelism ,
2545- ]
2546- self .dcn_parallelism = [
2547- self .dcn_diloco_parallelism ,
2548- self .dcn_pipeline_parallelism ,
2549- self .dcn_data_parallelism ,
2550- self .dcn_fsdp_parallelism ,
2551- self .dcn_fsdp_transpose_parallelism ,
2552- self .dcn_sequence_parallelism ,
2553- self .dcn_context_parallelism ,
2554- self .dcn_context_autoregressive_parallelism ,
2555- self .dcn_tensor_parallelism ,
2556- self .dcn_tensor_transpose_parallelism ,
2557- self .dcn_tensor_sequence_parallelism ,
2558- self .dcn_expert_parallelism ,
2559- self .dcn_autoregressive_parallelism ,
2560- ]
2561- else :
2562- ici_map = {
2563- "diloco" : self .ici_diloco_parallelism ,
2564- "data" : self .ici_data_parallelism ,
2565- "stage" : self .ici_pipeline_parallelism ,
2566- "fsdp" : self .ici_fsdp_parallelism ,
2567- "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2568- "sequence" : self .ici_sequence_parallelism ,
2569- "context" : self .ici_context_parallelism ,
2570- "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2571- "tensor" : self .ici_tensor_parallelism ,
2572- "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2573- "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2574- "model" : self .ici_tensor_parallelism ,
2575- "expert" : self .ici_expert_parallelism ,
2576- "autoregressive" : self .ici_autoregressive_parallelism ,
2577- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2530+ # if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2531+ # self.ici_parallelism = [
2532+ # self.ici_diloco_parallelism,
2533+ # self.ici_pipeline_parallelism,
2534+ # self.ici_data_parallelism,
2535+ # self.ici_fsdp_parallelism,
2536+ # self.ici_fsdp_transpose_parallelism,
2537+ # self.ici_sequence_parallelism,
2538+ # self.ici_context_parallelism,
2539+ # self.ici_context_autoregressive_parallelism,
2540+ # self.ici_tensor_parallelism,
2541+ # self.ici_tensor_transpose_parallelism,
2542+ # self.ici_tensor_sequence_parallelism,
2543+ # self.ici_expert_parallelism,
2544+ # self.ici_autoregressive_parallelism,
2545+ # ]
2546+ # self.dcn_parallelism = [
2547+ # self.dcn_diloco_parallelism,
2548+ # self.dcn_pipeline_parallelism,
2549+ # self.dcn_data_parallelism,
2550+ # self.dcn_fsdp_parallelism,
2551+ # self.dcn_fsdp_transpose_parallelism,
2552+ # self.dcn_sequence_parallelism,
2553+ # self.dcn_context_parallelism,
2554+ # self.dcn_context_autoregressive_parallelism,
2555+ # self.dcn_tensor_parallelism,
2556+ # self.dcn_tensor_transpose_parallelism,
2557+ # self.dcn_tensor_sequence_parallelism,
2558+ # self.dcn_expert_parallelism,
2559+ # self.dcn_autoregressive_parallelism,
2560+ # ]
2561+ # else:
2562+ ici_map = {
2563+ "diloco" : self .ici_diloco_parallelism ,
2564+ "data" : self .ici_data_parallelism ,
2565+ "stage" : self .ici_pipeline_parallelism ,
2566+ "fsdp" : self .ici_fsdp_parallelism ,
2567+ "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2568+ "sequence" : self .ici_sequence_parallelism ,
2569+ "context" : self .ici_context_parallelism ,
2570+ "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2571+ "tensor" : self .ici_tensor_parallelism ,
2572+ "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2573+ "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2574+ "model" : self .ici_tensor_parallelism ,
2575+ "expert" : self .ici_expert_parallelism ,
2576+ "autoregressive" : self .ici_autoregressive_parallelism ,
2577+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
25782578 "attn_dp_expert" : 1 , # initialized to 1, vLLM will auto calculate this value based on EP
2579- }
2580- self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2581-
2582- dcn_map = {
2583- "diloco" : self .dcn_diloco_parallelism ,
2584- "data" : self .dcn_data_parallelism ,
2585- "stage" : self .dcn_pipeline_parallelism ,
2586- "fsdp" : self .dcn_fsdp_parallelism ,
2587- "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2588- "sequence" : self .dcn_sequence_parallelism ,
2589- "context" : self .dcn_context_parallelism ,
2590- "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2591- "tensor" : self .dcn_tensor_parallelism ,
2592- "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2593- "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2594- "model" : self .dcn_tensor_parallelism ,
2595- "expert" : self .dcn_expert_parallelism ,
2596- "autoregressive" : self .dcn_autoregressive_parallelism ,
2597- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2579+ }
2580+ self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2581+
2582+ dcn_map = {
2583+ "diloco" : self .dcn_diloco_parallelism ,
2584+ "data" : self .dcn_data_parallelism ,
2585+ "stage" : self .dcn_pipeline_parallelism ,
2586+ "fsdp" : self .dcn_fsdp_parallelism ,
2587+ "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2588+ "sequence" : self .dcn_sequence_parallelism ,
2589+ "context" : self .dcn_context_parallelism ,
2590+ "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2591+ "tensor" : self .dcn_tensor_parallelism ,
2592+ "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2593+ "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2594+ "model" : self .dcn_tensor_parallelism ,
2595+ "expert" : self .dcn_expert_parallelism ,
2596+ "autoregressive" : self .dcn_autoregressive_parallelism ,
2597+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
25982598 "attn_dp_expert" : 1 , # initialized to 1, vLLM will auto calculate this value based on EP
2599- }
2600- self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
2599+ }
2600+ self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
26012601
26022602 # Diloco params
26032603 self .num_diloco_replicas = int (self .ici_diloco_parallelism * self .dcn_diloco_parallelism )
0 commit comments