@@ -220,7 +220,7 @@ expert_shard_attention_option: "fsdp"
 
 # when moe weight matrices are sharded on both fsdp and fsdp-transpose axes, use two separate all-gather calls
 moe_fsdp_use_two_stage_all_gather: false
-# Shard the expert dimension of the MLP weights on the FSDP axis.
+# Shard the expert dimension of the MLP weights on the FSDP axis.
 # This configuration is recommended only when num_experts is a multiple of fsdp_parallelism
 shard_exp_on_fsdp: False
 # use fsdp and fsdp_transpose axes for sharding the moe weights
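
To make the MoE sharding flags in this hunk concrete, here is a minimal sketch of an override that exercises them together. The values are illustrative assumptions, not defaults; `num_experts` and `ici_fsdp_parallelism` are existing MaxText options, chosen here so that `num_experts` is a multiple of the FSDP degree, as the comment above recommends.

```yaml
# Illustrative MoE sharding overrides (a sketch, not a tested recipe).
num_experts: 16            # assumption: 16 experts
ici_fsdp_parallelism: 4    # assumption: FSDP degree 4; 16 % 4 == 0, so expert sharding divides evenly
shard_exp_on_fsdp: True    # shard the expert dimension of the MLP weights on the FSDP axis
moe_fsdp_use_two_stage_all_gather: true  # two all-gather calls when fsdp and fsdp-transpose both shard the weights
```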
@@ -535,7 +535,7 @@ num_vocab_tiling: 1
 
 # Tokenizer
 vocab_size: 32_000 # powers of 2 for sharding
-tokenizer_path: "src/MaxText/src/maxtext/assets/tokenizers/tokenizer.llama2"
+tokenizer_path: "src/maxtext/assets/tokenizers/tokenizer.llama2"
 # tfds pipeline supports tokenizer_type: sentencepiece, huggingface, tiktoken
 # grain pipeline supports tokenizer_type: sentencepiece, huggingface
 # hf pipeline only supports huggingface type, and will ignore tokenizer_type flag
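
As a usage note, the comments above imply that `tokenizer_type` must match what the chosen data pipeline supports. A hedged example pairing, assuming the llama2 tokenizer asset is a sentencepiece model, using the existing `tokenizer_type` and `dataset_type` options:

```yaml
# Sketch: sentencepiece tokenizer with the tfds pipeline (values are assumptions).
tokenizer_path: "src/maxtext/assets/tokenizers/tokenizer.llama2"
tokenizer_type: "sentencepiece"  # supported by both tfds and grain pipelines
dataset_type: "tfds"             # the hf pipeline would ignore tokenizer_type and expect a huggingface tokenizer
```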
@@ -1027,7 +1027,7 @@ use_mrope: false
 mrope_section: [24, 20, 20]
 position_id_per_seconds: 25
 
-# Subslice shape in the form of "x,y,z" when using pathways (single controller).
+# Subslice shape in the form of "x,y,z" when using pathways (single controller).
 # Example: "8,8" to use a 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
 
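
To make the subslice option concrete, here is a sketch of the override described in the comment, requesting an 8x8 subgrid (64 chips) of a full 16x16 trillium pod when running under pathways (single controller); the value is taken directly from the example in the comment above.

```yaml
# Sketch: run on a 64-chip subslice of a full trillium pod (pathways / single controller).
subslice_shape: "8,8"
```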