AI-Hypercomputer
diff --git a/‎src/MaxText/configs/rl.yml‎
Lines changed: 21 additions & 19 deletions b/‎src/MaxText/configs/rl.yml‎
Lines changed: 21 additions & 19 deletions
diff --git a/‎src/MaxText/configs/types.py‎
Lines changed: 7 additions & 5 deletions b/‎src/MaxText/configs/types.py‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎src/MaxText/examples/rl_llama3_demo.ipynb‎
Lines changed: 23 additions & 26 deletions b/‎src/MaxText/examples/rl_llama3_demo.ipynb‎
Lines changed: 23 additions & 26 deletions
diff --git a/‎src/MaxText/max_logging.py‎
Lines changed: 18 additions & 0 deletions b/‎src/MaxText/max_logging.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎src/MaxText/rl/evaluate_rl.py‎
Lines changed: 4 additions & 4 deletions b/‎src/MaxText/rl/evaluate_rl.py‎
Lines changed: 4 additions & 4 deletions
@@ -31,25 +31,27 @@ rollout_tensor_parallelism: -1
 # ====== Reproducibility ======
 data_shuffle_seed: 42
 
-# ====== GRPO ======
-
-# The number of times the policy generates multiple responses for a given prompt
-# within a single training step. This corresponds to `G` in Algorithm 1 in the
-# paper. The "group" in GRPO comes from here.
-num_generations: 2
-
-# === other GRPO configs ===
-# The number of iterations per batch (𝜇 in GRPO algo 1).
-num_iterations: 1
-
-# The coefficient for the KL divergence penalty (𝛽) in the GRPO loss function.
-# Important to keep a high enough value for this, otherwise, the KL divergence
-# can increase unchecked.
-grpo_beta: 0.08
-# Epsilon value for clipping (𝜀 in GRPO loss in paper). Similar to PPO, for
-# stable updates.
-grpo_epsilon: 0.2
-loss_algo: 'grpo' # grpo or gspo-token
+# ====== RL ======
+# This config includes RL algorithm variations such as grpo or gspo-token
+rl:
+  # ====== GRPO/GSPO-Token ======
+  # The number of responses the policy generates for a given prompt within a
+  # single training step. This corresponds to `G` in Algorithm 1 in the GRPO
+  # paper. The "group" in GRPO comes from here.
+  num_generations: 2
+
+  # === other GRPO configs ===
+  # The number of iterations per batch (𝜇 in GRPO algo 1).
+  num_iterations: 1
+
+  # The coefficient for the KL divergence penalty (𝛽) in the GRPO loss function.
+  # Keep this value high enough; otherwise, the KL divergence can increase
+  # unchecked.
+  grpo_beta: 0.08
+  # Epsilon value for clipping (𝜀 in GRPO loss in paper). Similar to PPO, for
+  # stable updates.
+  grpo_epsilon: 0.2
+  loss_algo: 'grpo' # grpo or gspo-token
 
 
 # ====== Models ======
 
@@ -1398,14 +1398,14 @@ class VLLM(BaseModel):
   vllm_hf_config_path: str = Field("", description="Path to HuggingFace model config for MaxText model.")
 
 
-class GRPO(BaseModel):
-  """Configuration for Group Relative Policy Optimization (GRPO)."""
+class RL(BaseModel):
+  """Configuration for RL algorithms like Group Relative Policy Optimization (GRPO) among others."""
 
   num_generations: int = Field(2, description="Number of responses to generate per prompt (G in GRPO paper).")
   num_iterations: int = Field(1, description="Number of iterations per batch (μ in GRPO paper).")
   grpo_beta: float = Field(0.08, description="Coefficient for the KL divergence penalty (β).")
   grpo_epsilon: float = Field(0.2, description="Epsilon value for clipping in the GRPO loss.")
-  loss_algo: str = Field("grpo", description="Loss algorithm, e.g., 'grpo' or 'gspo-token'.")
+  loss_algo: Literal["grpo", "gspo-token"] = Field("grpo", description="Loss algorithm, i.e., 'grpo' or 'gspo-token'.")
 
 
 class RLDataset(BaseModel):
@@ -1639,7 +1639,6 @@ class MaxTextConfig(
     # Reinforcement Learning
     RLHardware,
     VLLM,
-    GRPO,
     RLDataset,
     RLEvaluation,
     Reward,
@@ -1689,6 +1688,9 @@ class MaxTextConfig(
   """
 
   debug: Debug = Field(default_factory=Debug, description="Configuration for debugging options.")
+  rl: RL = Field(
+      default_factory=RL, description="Configuration for RL algorithms like Group Relative Policy Optimization (GRPO)."
+  )
   model_config = ConfigDict(extra="forbid", protected_namespaces=())
 
   @model_validator(mode="before")
@@ -2134,7 +2136,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
       raise ValueError("`eval_steps` must be > 0 when `generate_padding_batch_eval` is True.")
     if self.dataset_type == "hf" and self.num_epoch != 1:
       raise ValueError("HuggingFace pipeline only supports num_epoch=1.")
-    if self.loss_algo == "grpo":
+    if self.rl.loss_algo == "grpo":
       self.use_grpo = True
     else:
       self.use_grpo = False
 
@@ -107,7 +107,6 @@
     "from pathlib import Path\n",
     "import MaxText\n",
     "from huggingface_hub import login\n",
-    "import jax\n",
     "\n",
     "# Set up paths (adjust if needed)\n",
     "MAXTEXT_REPO_ROOT = os.path.dirname(MaxText.__file__)\n",
@@ -127,26 +126,21 @@
     "    raise RuntimeError(\"OUTPUT_DIRECTORY is not set\")\n",
     "    \n",
     "os.environ[\"HF_TOKEN\"] = HF_TOKEN\n",
+    "if \"MAXTEXT_PKG_DIR\" not in os.environ:\n",
+    "    os.environ[\"MAXTEXT_PKG_DIR\"] = MAXTEXT_REPO_ROOT\n",
     "\n",
     "if HF_TOKEN:\n",
     "    login(token=HF_TOKEN)\n",
     "    print(\"Authenticated with Hugging Face\")\n",
     "else:\n",
     "    print(\"Authentication failed: Hugging Face token not set\")\n",
     "\n",
-    "# Optional: Override training parameters\n",
-    "LEARNING_RATE = 3e-6\n",
-    "NUM_GENERATIONS = 2\n",
-    "GRPO_BETA = 0.08\n",
-    "GRPO_EPSILON = 0.2\n",
-    "CHIPS_PER_VM = 1\n",
     "\n",
     "print(f\"📁 MaxText Home: {MAXTEXT_REPO_ROOT}\")\n",
     "print(f\"🤖 Model: {MODEL_NAME}\")\n",
     "print(f\"📦 Checkpoint: {MODEL_CHECKPOINT_PATH}\")\n",
     "print(f\"💾 Output: {OUTPUT_DIRECTORY}\")\n",
     "print(f\"🔑 HF Token: {'✅ Set' if HF_TOKEN else '❌ Missing - set HF_TOKEN env var'}\")\n",
-    "print(f\"📊 Steps: {STEPS}\")\n",
     "print(f\"Loss Algorithm : {LOSS_ALGO}\")"
    ]
   },
@@ -178,10 +172,10 @@
    "outputs": [],
    "source": [
     "# Build configuration for GRPO training\n",
-    "config_file = os.path.join(MAXTEXT_REPO_ROOT, \"configs/rl.yml\")\n",
+    "config_file = os.path.join(MAXTEXT_REPO_ROOT, \"configs\", \"rl.yml\")\n",
     "\n",
     "# Verify chat template exists\n",
-    "if not os.path.exists(CHAT_TEMPLATE_PATH)):\n",
+    "if not os.path.exists(CHAT_TEMPLATE_PATH):\n",
     "    raise FileNotFoundError(f\"Chat template not found: {CHAT_TEMPLATE_PATH}\")\n",
     "\n",
     "# Build argv list for pyconfig.initialize()\n",
@@ -195,23 +189,26 @@
     "    f\"load_parameters_path={MODEL_CHECKPOINT_PATH}\",\n",
     "    f\"base_output_directory={OUTPUT_DIRECTORY}\",\n",
     "    f\"hf_access_token={HF_TOKEN}\",\n",
-    "    f\"learning_rate={LEARNING_RATE}\",\n",
-    "    f\"num_generations={NUM_GENERATIONS}\",\n",
-    "    f\"grpo_beta={GRPO_BETA}\",\n",
-    "    f\"grpo_epsilon={GRPO_EPSILON}\",\n",
-    "    f\"chips_per_vm={CHIPS_PER_VM}\",\n",
-    "    f\"loss_algo={LOSS_ALGO}\",\n",
+    "    f\"debug.rl=False\",\n",
+    "    f\"rl.loss_algo={LOSS_ALGO}\",\n",
     "    \"use_pathways=False\"\n",
     "]\n",
     "\n",
     "# Initialize configuration\n",
     "print(f\"🔧 Initializing configuration from: {config_file}\")\n",
-    "config = pyconfig.initialize(config_argv)\n",
+    "trainer_config, sampler_config, trainer_devices, sampler_devices = setup_configs_and_devices(config_argv)\n",
+    "\n",
+    "rl_train_steps = int(\n",
+    "      trainer_config.num_batches\n",
+    "      * trainer_config.rl.num_iterations\n",
+    "      * trainer_config.train_fraction\n",
+    "      * trainer_config.num_epoch\n",
+    "  )\n",
     "\n",
     "print(\"\\n✅ Configuration initialized successfully\")\n",
-    "print(f\"📊 Training steps: {config.steps}\")\n",
-    "print(f\"📁 Output directory: {config.base_output_directory}\")\n",
-    "print(f\"🤖 Model: {config.model_name}\")"
+    "print(f\"📁 Output directory: {trainer_config.base_output_directory}\")\n",
+    "print(f\"🤖 Model: {trainer_config.model_name}\")\n",
+    "print(f\"📊 RL Train Steps: {rl_train_steps}\")"
    ]
   },
   {
@@ -224,16 +221,16 @@
     "print(\"\\n\" + \"=\"*80)\n",
     "print(\"🚀 Starting Training...\")\n",
     "print(\"=\"*80)\n",
-    "print(1)\n",
     "try:\n",
     "    # Call the rl_train function (it handles everything internally)\n",
-    "    rl_train(config)\n",
+    "    rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices)\n",
     "    \n",
     "    print(\"\\n\" + \"=\"*80)\n",
     "    print(\"✅ Training Completed Successfully!\")\n",
+    "    print(f\"✍️ Note the improved evaluation accuracy metrics with just {rl_train_steps} RL training steps!\")\n",
     "    print(\"=\"*80)\n",
-    "    print(f\"📁 Checkpoints saved to: {config.checkpoint_dir}\")\n",
-    "    print(f\"📊 TensorBoard logs: {config.tensorboard_dir}\")\n",
+    "    print(f\"📁 Checkpoints saved to: {trainer_config.checkpoint_dir}\")\n",
+    "    print(f\"📊 TensorBoard logs: {trainer_config.tensorboard_dir}\")\n",
     "    print(f\"🎯 Model ready for inference!\")\n",
     "    \n",
     "except Exception as e:\n",
@@ -264,7 +261,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "maxtext_venv",
    "language": "python",
    "name": "python3"
   },
@@ -278,7 +275,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.12.11"
   }
  },
  "nbformat": 4,
 
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """Logging utilities."""
+import logging as std_logging
 from absl import logging
 
 
@@ -40,3 +41,20 @@ def warning(user_str):
 def error(user_str):
   """Logs a message at the ERROR level."""
   logging.error(user_str, stacklevel=2)
+
+
+# Define filter at module level to avoid pickling issues and ensure visibility
+class NoisyLogFilter(std_logging.Filter):
+  """
+  Logging filter that drops records whose message contains a known noisy pattern.
+  """
+
+  def filter(self, record):
+    # getMessage() returns the record's message with its args already merged in.
+    msg = record.getMessage()
+    # Suppress "Type mismatch" warnings from tunix/generate/utils.py
+    if "Type mismatch on" in msg:
+      return False
+    if "No mapping for flat state" in msg:
+      return False
+    return True
@@ -76,7 +76,7 @@ def generate_responses(
     )
     responses = responses.text
 
-    if tmvp_config.debug["rl"]:
+    if tmvp_config.debug.rl:
       max_logging.log(f"Pass {p+1}/{num_passes}, responses: {responses}")
 
     for idx, response in enumerate(responses):
@@ -101,7 +101,7 @@ def score_responses(tmvp_config, question, responses, answer):
   match_format = utils_rl.get_match_format_regex(tmvp_config)
   match_numbers = utils_rl.get_match_numbers_regex(tmvp_config)
 
-  if tmvp_config.debug["rl"]:
+  if tmvp_config.debug.rl:
     max_logging.log("========================================")
     max_logging.log(f"Evaluation Question: {question}")
     max_logging.log(f"Evaluation Answer: {answer}")
@@ -116,7 +116,7 @@ def score_responses(tmvp_config, question, responses, answer):
     # Extract numerical response
     extracted_response = guess.group(1) if (guess := match_numbers.search(response)) is not None else "-1000000"
 
-    if tmvp_config.debug["rl"]:
+    if tmvp_config.debug.rl:
       max_logging.log(f"Evaluation extracted_response: {extracted_response}")
 
     # Check exact correctness
@@ -132,7 +132,7 @@ def score_responses(tmvp_config, question, responses, answer):
         is_partially_correct = True
 
     except Exception as e:
-      if tmvp_config.debug["rl"]:
+      if tmvp_config.debug.rl:
         max_logging.log(f"Evaluation Exception: {e}")
         max_logging.log("SKIPPED")