Skip to content

Commit 7f0028b

Browse files
Merge pull request #3203 from AI-Hypercomputer:hengtaoguo-re
PiperOrigin-RevId: 874230320
2 parents d8c8862 + 6061c9d commit 7f0028b

82 files changed

Lines changed: 315 additions & 205 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.vscode/launch.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"console": "integratedTerminal",
99
"justMyCode": false,
1010
"python": "python3",
11-
"module": "maxtext.decode",
11+
"module": "maxtext.inference.decode",
1212
"args": ["src/maxtext/configs/base.yml",
1313
"run_name=runner_$(date +%Y-%m-%d-%H-%M)",
1414
"base_output_directory=gs://test-maxtext-output",
@@ -35,7 +35,7 @@
3535
"console": "integratedTerminal",
3636
"justMyCode": false,
3737
"python": "python3",
38-
"module": "maxtext.decode",
38+
"module": "maxtext.inference.decode",
3939
"args": ["src/maxtext/configs/base.yml",
4040
"run_name=runner_$(date +%Y-%m-%d-%H-%M)",
4141
"base_output_directory=gs://test-maxtext-output",

benchmarks/api_server/maxtext_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434

3535
from dataclasses import dataclass, field
3636

37-
from MaxText import maxengine, pyconfig
37+
from MaxText import pyconfig
38+
from maxtext.inference.maxengine import maxengine
3839
from maxtext.multimodal import processor as mm_processor
3940
from maxtext.multimodal import utils as mm_utils
4041
from maxtext.utils import max_logging, max_utils

benchmarks/mmlu/mmlu_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
from tqdm import tqdm
5858

5959
from MaxText import pyconfig
60-
from MaxText import maxengine
60+
from maxtext.inference.maxengine import maxengine
6161
from maxtext.utils import max_logging
6262
from maxtext.utils import max_utils
6363

docs/run_maxtext/run_maxtext_localhost.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
7272
To demonstrate model output, run the following command:
7373

7474
```bash
75-
python3 -m maxtext.decode src/maxtext/configs/base.yml \
75+
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
7676
run_name=$YOUR_JOB_NAME \
7777
base_output_directory=gs://<my-bucket> \
7878
per_device_batch_size=1

docs/tutorials/first_run.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ Optional: If you want to try training on a Hugging Face dataset, see [Data Input
6161
5. To demonstrate model output, run the following command:
6262

6363
```sh
64-
python3 -m maxtext.decode src/maxtext/configs/base.yml \
64+
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
6565
run_name=$YOUR_JOB_NAME \
6666
base_output_directory=gs://<my-bucket> \
6767
per_device_batch_size=1
@@ -93,7 +93,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
9393
3. To demonstrate model output, run the following command:
9494

9595
```sh
96-
python3 -m maxtext.decode src/maxtext/configs/base.yml \
96+
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
9797
run_name=$YOUR_JOB_NAME \
9898
base_output_directory=gs://<my-bucket> \
9999
per_device_batch_size=1

docs/tutorials/posttraining/multimodal.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ To run a forward pass and verify the model's output, use the following command:
7272

7373
```shell
7474
# Gemma3 decode
75-
python -m maxtext.decode \
75+
python -m maxtext.inference.decode \
7676
maxtext/configs/base.yml \
7777
model_name=gemma3-4b \
7878
hf_access_token=$HF_ACCESS_TOKEN \
@@ -108,7 +108,7 @@ To decode with multiple images at once, you can provide multiple image paths lik
108108
export TARGET_LENGTH=... # Adjust to fit expected output length
109109
export PREDICT_LENGTH=... # Adjust to fit image tokens + text prompt
110110
111-
python -m maxtext.decode \
111+
python -m maxtext.inference.decode \
112112
maxtext/configs/base.yml \
113113
model_name=gemma3-4b \
114114
... \

src/MaxText/decode.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright 2023–2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Shim for inference decode in `src/maxtext/inference/decode`."""
16+
17+
import sys
18+
import importlib
19+
20+
from absl import logging
21+
22+
from maxtext.utils import max_logging
23+
24+
OLD_MODULE_PATH = "MaxText.decode"
25+
NEW_MODULE_PATH = "maxtext.inference.decode"
26+
27+
if __name__ == "__main__":
28+
try:
29+
logging.set_verbosity(logging.INFO)
30+
_new_module = importlib.import_module(NEW_MODULE_PATH)
31+
if hasattr(_new_module, "main"):
32+
max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n")
33+
_new_module.main(sys.argv)
34+
except ImportError as e:
35+
max_logging.error(f"Shim could not find target module: '{NEW_MODULE_PATH}'\n")
36+
raise e

src/MaxText/maxengine_server.py

Lines changed: 19 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2023–2025 Google LLC
1+
# Copyright 2023–2026 Google LLC
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -12,94 +12,31 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""Runs a server with maxtext."""
16-
17-
from __future__ import annotations
15+
"""Shim for maxengine_server in `src/maxtext/inference/maxengine/maxengine_server`."""
1816

1917
import os
2018
import sys
21-
from typing import Any
19+
import importlib
2220

2321
import jax
22+
from absl import logging
2423

2524
from MaxText import pyconfig
26-
from MaxText import maxengine_config
27-
from maxtext.common import gcloud_stub
28-
29-
# _PORT = flags.DEFINE_integer('port', 9000, 'port to listen on')
30-
# _THREADS = flags.DEFINE_integer(
31-
# 'threads', 64, 'number of worker threads in thread pool'
32-
# )
33-
# _CONFIG = flags.DEFINE_string(
34-
# 'config',
35-
# 'MaxtextInterleavedServer',
36-
# 'available servers',
37-
# )
38-
39-
40-
def _create_prefix_caching_config(config, config_lib_module):
41-
if not config.enable_prefix_caching:
42-
return None
43-
44-
if not config.use_chunked_prefill:
45-
raise ValueError("Prefix caching requires chunked prefill.")
46-
47-
return config_lib_module.PrefixCachingConfig(
48-
max_hbm_byte=config.prefix_caching_hbm_byte,
49-
max_dram_byte=config.prefix_caching_dram_byte,
50-
)
51-
52-
53-
def main(config):
54-
# Obtain the jetstream helper modules (or stubs if appropriate).
55-
config_lib, _engine_api, *_ = gcloud_stub.jetstream()
56-
57-
# If running decoupled and gcloud_stub returned lightweight stubs, skip
58-
# starting the real server. Use the explicit _IS_STUB marker when present.
59-
config_lib_is_stub = getattr(config_lib, "_IS_STUB", False)
60-
engine_api_is_stub = getattr(_engine_api, "_IS_STUB", False)
61-
if gcloud_stub.is_decoupled() and (config_lib_is_stub or engine_api_is_stub):
62-
raise RuntimeError(
63-
"JetStream helper modules are stubbed or DECOUPLE_GCLOUD=TRUE; server cannot be started in decoupled mode. "
64-
"Unset DECOUPLE_GCLOUD or install JetStream to run the server."
65-
)
66-
67-
# Import the real server_lib now that it's known present.
68-
from jetstream.core import server_lib # type: ignore # pylint: disable=import-outside-toplevel
69-
import pathwaysutils # pylint: disable=unused-import,import-outside-toplevel
70-
71-
pathwaysutils.initialize()
72-
73-
# No devices for local cpu test. A None for prefill and a None for generate.
74-
devices = server_lib.get_devices()
75-
server_config = maxengine_config.get_server_config(config.inference_server, config)
76-
77-
metrics_server_config: Any | None = None
78-
if config.prometheus_port != 0:
79-
metrics_server_config = config_lib.MetricsServerConfig(port=config.prometheus_port)
80-
81-
# We separate credential from run so that we can unit test it with
82-
# local credentials.
83-
# TODO: Add grpc credentials for OSS.
84-
# pylint: disable=unexpected-keyword-arg
85-
jetstream_server = server_lib.run(
86-
threads=256,
87-
port=9000,
88-
config=server_config,
89-
devices=devices,
90-
metrics_server_config=metrics_server_config,
91-
enable_jax_profiler=config.enable_jax_profiler if config.enable_jax_profiler else False,
92-
jax_profiler_port=config.jax_profiler_port if config.jax_profiler_port else 9999,
93-
enable_model_warmup=config.enable_model_warmup if config.enable_model_warmup else False,
94-
lora_input_adapters_path=config.lora_input_adapters_path,
95-
multi_sampling=config.multi_sampling if config.multi_sampling else False,
96-
prefix_caching_config=_create_prefix_caching_config(config, config_lib),
97-
)
98-
jetstream_server.wait_for_termination()
25+
from maxtext.utils import max_logging
9926

27+
OLD_MODULE_PATH = "MaxText.maxengine_server"
28+
NEW_MODULE_PATH = "maxtext.inference.maxengine.maxengine_server"
10029

10130
if __name__ == "__main__":
102-
jax.config.update("jax_default_prng_impl", "unsafe_rbg")
103-
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
104-
cfg = pyconfig.initialize(sys.argv)
105-
main(cfg)
31+
try:
32+
jax.config.update("jax_default_prng_impl", "unsafe_rbg")
33+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
34+
logging.set_verbosity(logging.INFO)
35+
max_logging.warning(f"'{OLD_MODULE_PATH}' is deprecated; use '{NEW_MODULE_PATH}' instead.\n")
36+
_new_module = importlib.import_module(NEW_MODULE_PATH)
37+
if hasattr(_new_module, "main"):
38+
cfg = pyconfig.initialize(sys.argv)
39+
_new_module.main(cfg)
40+
except ImportError as e:
41+
max_logging.error(f"Shim could not find target module: '{NEW_MODULE_PATH}'\n")
42+
raise e

src/maxtext/checkpoint_conversion/examples/convert_gemma2_to_mt.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ echo "--- Checkpoint Conversion Complete ---"
3434
# --- Step 2 (Optional): Decode using the Converted Checkpoint ---
3535

3636
echo "--- Starting Decoding ---"
37-
python3 -m maxtext.decode \
37+
python3 -m maxtext.inference.decode \
3838
${MAXTEXT_CONFIGS_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/configs}/base.yml \
3939
model_name="${MODEL_NAME}" \
4040
tokenizer_path="${TOKENIZER_PATH}" \

src/maxtext/checkpoint_conversion/load_and_quantize_checkpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121

2222
import jax
2323

24-
from MaxText import maxengine
2524
from MaxText import pyconfig
25+
from maxtext.inference.maxengine import maxengine
2626
from maxtext.utils import max_utils
2727

2828

0 commit comments

Comments (0)