You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
activations_in_float32: False#Sets activations to float32 before nonlinearity it true, else dtype
109
-
#Used to replicate the quantization scale to avoid the inefficient XLA fusion for 2d sharding.
110
-
replicate_quant_scale: False
111
-
#Path to file with quantization config for intmp.
109
+
activations_in_float32: false  # sets activations to float32 before nonlinearity if true, else dtype
110
+
#used to replicate the quantization scale to avoid the inefficient xla fusion for 2d sharding.
111
+
replicate_quant_scale: false
112
+
#path to file with quantization config for intmp.
112
113
quant_cfg_path: ""
113
-
quantize_kvcache: False#Set to True to quantize KV Cache values, defaults to False
114
-
#Valid kv_quant_axis values:
115
-
# - "" is valid only when quantize_kvcache is False
114
+
quantize_kvcache: false  # set to true to quantize kv cache values, defaults to false
115
+
#valid kv_quant_axis values:
116
+
# - "" is valid only when quantize_kvcache is false
116
117
# - "dkv" indicates quantize kv cache over the cache_kv, i.e. kv dimension axis
117
118
# - "heads_and_dkv" indicates quantize kv cache over cache_heads and cache_kv axes
118
-
#Default to "heads_and_dkv" for faster compution, kv_quant_axis is not used when quantize_kvcache is False
119
+
# default to "heads_and_dkv" for faster computation, kv_quant_axis is not used when quantize_kvcache is false
119
120
# - "dkv" is expected with better accuracy but degraded computation
120
121
kv_quant_axis: "heads_and_dkv"
121
122
kv_quant_dtype: "int8"
122
-
checkpoint_is_quantized: False#Set to True if reading from a saved aqt quantized checkpoint
123
-
#Saves params quantized on fly at following path
123
+
checkpoint_is_quantized: false  # set to true if reading from a saved aqt quantized checkpoint
124
+
#saves params quantized on fly at following path
124
125
save_quantized_params_path: ""
125
-
#Used to configure the mode in which model is called
126
+
#used to configure the mode in which model is called
126
127
# when left as is, corresponds to training
127
128
# accepted values are "inference"
128
129
model_call_mode: ""
129
-
use_qwix_quantization: False#Whether to use qwix for quantization. If set to True, the model will be quantized using qwix.
130
-
#Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
130
+
use_qwix_quantization: false  # whether to use qwix for quantization. if set to true, the model will be quantized using qwix.
131
+
# quantization calibration method used for weights and activations. supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
131
132
weight_quantization_calibration_method: "absmax"
132
133
act_quantization_calibration_method: "absmax"
133
134
bwd_quantization_calibration_method: "absmax"
134
-
#Shard the range finding operation for quantization. By default this is set to number of slices.
135
+
#shard the range finding operation for quantization. by default this is set to number of slices.
135
136
quantization_local_shard_count: -1
136
137
137
-
decoder_block: "llama2"# which style of DecoderBlock to use.
138
-
#Global parameter scale needs to be a power of 2. If you want finer grained control of the model sizes
138
+
decoder_block: "llama2"  # which style of DecoderBlock to use.
139
+
#global parameter scale needs to be a power of 2. if you want finer grained control of the model sizes
139
140
# then you should explicitly set base_embed_dim, base_num_query_heads, base_num_kv_heads,
0 commit comments