Fix formatting in docstrings for src/MaxText

melissawm · melissawm · commit 498d4827a87d · 2026-02-25T14:48:30.000-03:00
diff --git a/src/MaxText/estimator.py b/src/MaxText/estimator.py
@@ -24,9 +24,9 @@
 size that does not cause an out-of-memory (OOM) error.
 
 The key functions in this script are:
-- `is_oom`: Checks if a given configuration results in an OOM error.
-- `largest_batch_size`: Finds the largest batch size for a given policy.
-- `search`: The main algorithm that iterates through policies and batch sizes.
+- ``is_oom``: Checks if a given configuration results in an OOM error.
+- ``largest_batch_size``: Finds the largest batch size for a given policy.
+- ``search``: The main algorithm that iterates through policies and batch sizes.
 
 By automating this search, the script helps to efficiently find the most
 performant and memory-efficient training configurations.
@@ -141,8 +141,8 @@ def next_policy(policy: dict) -> dict[str, str] | None:
   Generates the next rematerialization policy in the sequence.
 
   This function iterates through the policy and changes the first tensor it
-  finds with a 'device' value to 'offload', or the first 'offload' to 'remat'.
-  If all tensors are already set to 'remat', it returns None.
+  finds with a ``device`` value to ``offload``, or the first ``offload`` to
+  ``remat``. If all tensors are already set to ``remat``, it returns None.
 
   Args:
     policy: The current policy dictionary.
@@ -166,18 +166,20 @@ def next_policy(policy: dict) -> dict[str, str] | None:
 
 def largest_batch_size(base_argv, policy, min_pdb, max_pdb=64) -> int:
   """
-  Finds the largest possible per_device_batch_size (pdb) that does not cause an OOM error.
+  Finds the largest possible ``per_device_batch_size`` (pdb) that does not cause
+  an OOM error.
 
   This function uses a binary search algorithm within the provided min and max
   range to efficiently find the optimal batch size.
 
   Args:
     policy: The rematerialization policy dictionary.
-    min_pdb: The minimum per_device_batch_size to test.
-    max_pdb: The maximum per_device_batch_size to test.
+    min_pdb: The minimum ``per_device_batch_size`` to test.
+    max_pdb: The maximum ``per_device_batch_size`` to test.
 
   Returns:
-    The largest per_device_batch_size within the range that does not result in an OOM error.
+    The largest ``per_device_batch_size`` within the range that does not result
+    in an OOM error.
   """
   print(f"Starting binary search for the largest batch size between {min_pdb} and {max_pdb}.")
 
@@ -263,7 +265,7 @@ def search_policy_only(
     base_argv: The base command-line arguments.
     pdb: The fixed per-device batch size to test against.
     init_policy: The policy to start searching from. If None, defaults to
-                  'full_device_policy' (no remat).
+      ``full_device_policy`` (no remat).
 
   Returns:
     The first rematerialization policy that did *not* OOM.
@@ -300,7 +302,7 @@ def search(
 
   Args:
     config: The model configuration.
-    max_pdb: The maximum per_device_batch_size to test.
+    max_pdb: The maximum ``per_device_batch_size`` to test.
 
   Returns:
     A list of tuples, where each tuple contains a batch size and its
@@ -341,12 +343,13 @@ def get_parameter_value(config_tuple, prefix):
 
   Args:
     config_tuple: A tuple of strings to search.
-    prefix: The prefix string to look for (e.g., 'key=').
+    prefix: The prefix string to look for (e.g., ``key=``).
 
   Returns:
     A tuple of (bool, str or None).
-    - (True, value) if the prefix is found.
-    - (False, None) if the prefix is not found.
+
+    * ``(True, value)`` if the prefix is found.
+    * ``(False, None)`` if the prefix is not found.
   """
   for item in config_tuple:
     if item.startswith(prefix):
@@ -361,15 +364,16 @@ def get_parameter_value(config_tuple, prefix):
 
 def find_batch_size(base_argv):
   """
-  Parses the base arguments to find the 'per_device_batch_size'.
+  Parses the base arguments to find the ``per_device_batch_size``.
 
   Args:
-      base_argv: The tuple of command-line arguments.
+    base_argv: The tuple of command-line arguments.
 
   Returns:
-      A tuple of (bool, int or None):
-      - (True, batch_size) if 'per_device_batch_size=...' was found.
-      - (False, None) if it was not found.
+    A tuple of (bool, int or None).
+
+    * ``(True, batch_size)`` if ``per_device_batch_size=...`` was found.
+    * ``(False, None)`` if it was not found.
   """
   pdb_provided, pdb_str = get_parameter_value(base_argv, prefix="per_device_batch_size=")
 
@@ -384,10 +388,10 @@ def find_remat_policy_tensor_names(base_argv):
   to be considered for rematerialization.
 
   Args:
-      base_argv: The tuple of command-line arguments.
+    base_argv: The tuple of command-line arguments.
 
   Returns:
-      A list of tensor names that were passed as flags.
+    A list of tensor names that were passed as flags.
   """
   full_tensor_list = [
       "context",
diff --git a/src/MaxText/generate_param_only_checkpoint.py b/src/MaxText/generate_param_only_checkpoint.py
@@ -14,11 +14,12 @@
 
 # pylint: disable=g-bad-todo, abstract-method, consider-using-with
 """Transforms a "full state" including optimizer state to a bfloat16 "parameter state" without optimizer state.
-   This typically used for turning a state output by training.py into a state than can be consumed by decode.py.
 
-   The input "fullstate" is passed in via:
-     load_full_state_path.
-   The output "parameter state" is output to the checkpoint directory. Additionally it is cast down to bf16.
+This is typically used for turning a state output by training.py into a state that can be consumed by decode.py.
+
+The input "fullstate" is passed in via ``load_full_state_path``.
+
+The output "parameter state" is output to the checkpoint directory. Additionally it is cast down to bf16.
 """
 
 import os.path
@@ -155,8 +156,9 @@ def _save_decode_checkpoint(config, state, checkpoint_manager):
 def generate_decode_checkpoint(config):
   """
   Generate an decode checkpoint from a given training checkpoint.
-  - Training checkpoint is loaded from config.load_full_state_path.
-  - Inference checkpoint will be saved at the config's checkpoint directory.
+
+  * Training checkpoint is loaded from ``config.load_full_state_path``.
+  * Inference checkpoint will be saved at the config's checkpoint directory.
   """
 
   devices_array = maxtext_utils.create_device_mesh(config)
diff --git a/src/MaxText/pyconfig_deprecated.py b/src/MaxText/pyconfig_deprecated.py
@@ -1285,12 +1285,16 @@ def validate_and_update_keys(raw_keys, model_keys, config_name: str):
 
 def get_individual_scales(scale):
   """Choose appropriate scales for individual dimensions based on global scale
+
   We choose to rotate between doubling:
-    num_head and mlp_dim
-    embed_dim
-    num_layers
+
+  * ``num_head`` and ``mlp_dim``
+  * ``embed_dim``
+  * ``num_layers``
+
   Any one of these steps is not a perfect doubling, although going through a cycle
-  of three is a near perfect 8x scaling except for the linear -> softmax -> output step"""
+  of three is a near perfect 8x scaling except for the linear -> softmax -> output step
+  """
 
   log_2_scale = math.floor((math.log2(scale)))
   if 2**log_2_scale != scale:
diff --git a/src/MaxText/sft/sft_trainer.py b/src/MaxText/sft/sft_trainer.py
@@ -12,7 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-"""Shim for SFT Trainer in `src/maxtext/trainers/post_train/sft`."""
+"""Shim for SFT Trainer in ``src/maxtext/trainers/post_train/sft``."""
 
 import sys
 import importlib
diff --git a/src/MaxText/train_tokenizer.py b/src/MaxText/train_tokenizer.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Shim for `train_tokenizer` in `src/maxtext/trainers/tokenizer`."""
+"""Shim for ``train_tokenizer`` in ``src/maxtext/trainers/tokenizer``."""
 
 from absl import logging