@@ -212,7 +212,7 @@ expert_shard_attention_option: "fsdp"
 
 # when moe weight matrices are sharded on both fsdp and fsdp-transpose axes, use two separate all-gather calls
 moe_fsdp_use_two_stage_all_gather: false
-# Shard the expert dimension of the MLP weights on the FSDP axis.
+# Shard the expert dimension of the MLP weights on the FSDP axis.
 # This configuration is recommended only when num_experts is a multiple of fsdp_parallelism
 shard_exp_on_fsdp: False
 # use fsdp and fsdp_transpose axes for sharding the moe weights
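The two flags above control how MoE expert weights are laid out across the FSDP mesh axis. Below is a minimal JAX sketch of the idea behind `shard_exp_on_fsdp`: placing the leading (expert) dimension of an MoE weight tensor on an `"fsdp"` mesh axis, which is why `num_experts` should be a multiple of `fsdp_parallelism` for an even split. The mesh construction, axis names, and tensor shapes are illustrative assumptions, not MaxText's actual implementation.

```python
# Sketch only: shard the expert dimension of an MoE weight on an "fsdp" axis.
# Assumes at least `fsdp_parallelism` devices are visible; all names and
# shapes here are hypothetical, not taken from MaxText.
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

num_experts, d_model, d_ff = 8, 1024, 4096
fsdp_parallelism = 4  # num_experts (8) is a multiple of this, as recommended

devices = np.array(jax.devices()[:fsdp_parallelism])
mesh = Mesh(devices, axis_names=("fsdp",))

# Expert weights: [num_experts, d_model, d_ff]. Sharding the leading
# (expert) dimension over "fsdp" gives every device
# num_experts // fsdp_parallelism whole experts, with no padding.
w = jax.device_put(
    np.zeros((num_experts, d_model, d_ff), np.float32),
    NamedSharding(mesh, P("fsdp", None, None)),
)
print(w.sharding)  # with 4 devices, each shard holds 2 of the 8 experts
```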
@@ -527,7 +527,7 @@ num_vocab_tiling: 1
 
 # Tokenizer
 vocab_size: 32_000 # powers of 2 for sharding
-tokenizer_path: "src/MaxText/src/maxtext/assets/tokenizers/tokenizer.llama2"
+tokenizer_path: "src/maxtext/assets/tokenizers/tokenizer.llama2"
 # tfds pipeline supports tokenizer_type: sentencepiece, huggingface, tiktoken
 # grain pipeline supports tokenizer_type: sentencepiece, huggingface
 # hf pipeline only supports huggingface type, and will ignore tokenizer_type flag
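As a usage reference, here is a hedged sketch of loading and exercising the sentencepiece model that `tokenizer_path` points at; this is not MaxText's tokenizer-loading code, and it assumes the llama2 tokenizer asset is a sentencepiece model file, reusing the path from the config line above.

```python
# Sketch only: load the sentencepiece tokenizer referenced by tokenizer_path.
import sentencepiece as spm

tokenizer_path = "src/maxtext/assets/tokenizers/tokenizer.llama2"
sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

ids = sp.encode("hello world", out_type=int)
print(ids, sp.decode(ids))

# vocab_size in the config is kept at a power of two for sharding; it should
# be at least the model file's own vocabulary size:
print(sp.get_piece_size())
```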
@@ -1019,7 +1019,7 @@ use_mrope: false
 mrope_section: [24, 20, 20]
 position_id_per_seconds: 25
 
-# Subslice shape in the form of "x,y,z" when using pathways (single controller).
+# Subslice shape in the form of "x,y,z" when using pathways (single controller).
 # Example: "8,8" to use an 8x8 subgrid (64 chips) of a full pod (16x16) of trillium.
 subslice_shape: ""
 
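A small, hypothetical helper (not part of MaxText) makes the `subslice_shape` convention concrete: the string is a comma-separated grid shape, and the product of its dimensions is the number of chips used, so `"8,8"` selects 64 chips out of a 16x16 Trillium pod.

```python
# Hypothetical helper: interpret a subslice_shape string such as "8,8".
import math

def parse_subslice_shape(shape: str) -> tuple[int, ...]:
    """Parse an "x,y,z"-style subslice shape into a tuple of ints."""
    shape = shape.strip()
    if not shape:
        return ()  # empty string (the default): use the full slice
    return tuple(int(dim) for dim in shape.split(","))

dims = parse_subslice_shape("8,8")
print(dims, math.prod(dims))  # (8, 8) 64 chips
```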