@@ -2526,39 +2526,6 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25262526 raise ValueError ("`share_kv_projections` is not compatible with `attention_type='mla'`." )
25272527
25282528 # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
2529- # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2530- # if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2531- # self.ici_parallelism = [
2532- # self.ici_diloco_parallelism,
2533- # self.ici_pipeline_parallelism,
2534- # self.ici_data_parallelism,
2535- # self.ici_fsdp_parallelism,
2536- # self.ici_fsdp_transpose_parallelism,
2537- # self.ici_sequence_parallelism,
2538- # self.ici_context_parallelism,
2539- # self.ici_context_autoregressive_parallelism,
2540- # self.ici_tensor_parallelism,
2541- # self.ici_tensor_transpose_parallelism,
2542- # self.ici_tensor_sequence_parallelism,
2543- # self.ici_expert_parallelism,
2544- # self.ici_autoregressive_parallelism,
2545- # ]
2546- # self.dcn_parallelism = [
2547- # self.dcn_diloco_parallelism,
2548- # self.dcn_pipeline_parallelism,
2549- # self.dcn_data_parallelism,
2550- # self.dcn_fsdp_parallelism,
2551- # self.dcn_fsdp_transpose_parallelism,
2552- # self.dcn_sequence_parallelism,
2553- # self.dcn_context_parallelism,
2554- # self.dcn_context_autoregressive_parallelism,
2555- # self.dcn_tensor_parallelism,
2556- # self.dcn_tensor_transpose_parallelism,
2557- # self.dcn_tensor_sequence_parallelism,
2558- # self.dcn_expert_parallelism,
2559- # self.dcn_autoregressive_parallelism,
2560- # ]
2561- # else:
25622529 ici_map = {
25632530 "diloco" : self .ici_diloco_parallelism ,
25642531 "data" : self .ici_data_parallelism ,
0 commit comments