Commit 093fa33

chore: Merge branch 'main' into seed
2 parents: 2e248ed + b856127

123 files changed: 5,036 additions & 1,206 deletions


.gitignore

Lines changed: 5 additions & 1 deletion

@@ -171,4 +171,8 @@ tutorials/instruction_tuning/prepared_data
 config_files/instruction_tuning
 data/lorem_ipsum_instruct.jsonl
 tutorials/scaling_up/logs*
-tutorials/scaling_up/experiments_old/*
+tutorials/scaling_up/experiments_old/*
+results/*
+tutorials/einsum_transformer/experiments/*
+tutorials/warmstart/experiments/*
+

CHANGELOG_DEV.md

Lines changed: 17 additions & 1 deletion

@@ -201,4 +201,20 @@ Tensors can be either normal Tensors or DTensors.
 This PR fixes the MFU and throughput calculations by taking the dp degree into account instead of the world size. When we use parallelization strategies on top of FSDP, the world size differs from the data parallel degree, and this needs to be reflected in the throughput and MFU metric calculations, as done by this PR.
 
 **Breaking Changes**
-* Existing configs need to be adapted to correctly use dp degree rather than world size.
+* Existing configs need to be adapted to correctly use dp degree rather than world size.
+
+
+## PR #425 Monitoring improvements
+This PR improves training monitoring and logging across runs, alongside some other changes made while testing scalability.
+
+**General Changes**
+* Configurable multi-layer FSDP units
+* Option to provide an experiment root path to Modalities
+* Added a steppable profiler (e.g., for tracing forward/backward passes)
+* Fix: hybrid sharding is now correctly configurable
+* Completely refactored the profiling
+* Improved error handling; errors are now captured and stored as JSON
+* Added tutorials on the Einsum Transformer (example model integration) and on profiling
+
+**Breaking Changes**
+* experiments_root_path is now exposed at the API level
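For context on the MFU/throughput fix above: a minimal sketch of the reasoning, using hypothetical names (`dp_degree`, `model_flops_per_token`, etc.) rather than Modalities' actual API, which may differ.

```python
# Illustrative sketch (not the Modalities API): once tensor/pipeline parallelism
# runs on top of FSDP, only the data-parallel replicas consume distinct batches,
# so token throughput must scale with dp_degree, not world_size.

def tokens_per_second(local_batch_size: int, sequence_length: int,
                      step_time_s: float, dp_degree: int) -> float:
    # With world_size = 8 split into dp=2 x tp=4, scaling by world_size
    # would report 4x the true throughput; dp_degree (2) is correct.
    return local_batch_size * sequence_length * dp_degree / step_time_s

def mfu(tokens_per_s: float, model_flops_per_token: float,
        peak_flops_per_device: float, num_devices: int) -> float:
    # Model FLOPs utilization: achieved model FLOP/s over aggregate peak FLOP/s.
    return (tokens_per_s * model_flops_per_token) / (peak_flops_per_device * num_devices)
```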

CITATION.cff

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ authors:
 - family-names: Rutmann
   given-names: Richard
 title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.'
-version: 0.4.0
+version: 0.5.0
 url: https://github.com/Modalities/modalities
 date-released: '2024-12-02'
 preferred-citation:

README.md

Lines changed: 53 additions & 56 deletions
Large diffs are not rendered by default.

config_files/training/config_lorem_ipsum_long_fsdp2.yaml

Lines changed: 6 additions & 3 deletions
(The paired -/+ lines below with identical text appear to be whitespace-only changes, which this rendering cannot show.)

@@ -71,7 +71,7 @@ train_dataset:
   config:
     raw_data_path: ${settings.paths.train_dataset_path}
     sequence_length: ${settings.step_profile.sequence_length}
-    sample_key: ${settings.referencing_keys.sample_key}
+    sample_key: ${settings.referencing_keys.sample_key}
 
 train_dataloader:
   component_key: data_loader

@@ -195,7 +195,7 @@ app_state:
   component_key: app_state
   variant_key: raw
   config:
-    model:
+    model:
       instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:

@@ -305,7 +305,7 @@ optimizer:
     eps: 1e-8
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
-  wrapped_model:
+  wrapped_model:
     instance_key: initialized_model
     pass_type: BY_REFERENCE
 

@@ -318,6 +318,9 @@ gradient_clipper:
     pass_type: BY_REFERENCE
   norm_type: P2_NORM
   max_norm: 1.0
+  device_mesh:
+    instance_key: device_mesh
+    pass_type: BY_REFERENCE
 
 progress_subscriber:
   component_key: progress_subscriber

config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml

Lines changed: 4 additions & 4 deletions

@@ -194,7 +194,7 @@ device_mesh:
   config:
     device_type: cuda
     data_parallel_replicate_degree: 1
-    pipeline_parallel_degree: 2
+    pipeline_parallel_degree: 4
     data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}
 

@@ -251,7 +251,7 @@ scheduled_pipeline:
   loss_fn:
     instance_key: loss_fn
     pass_type: BY_REFERENCE
-  pp_schedule_name: gpipe
+  pp_schedule_name: Interleaved1F1B
   batch_size: ${settings.step_profile.local_train_micro_batch_size}
   microbatch_size: 2
   pp_degree: ${device_mesh.config.pipeline_parallel_degree}

@@ -318,7 +318,7 @@ staged_pipeline:
     instance_key: device_mesh
     pass_type: BY_REFERENCE
   local_rank: ${settings.cuda_env.local_rank}
-  pp_schedule_name: gpipe
+  pp_schedule_name: ${scheduled_pipeline.config.pp_schedule_name}
   num_layers_per_stage: 2
 
 model_raw:

@@ -332,7 +332,7 @@ model_raw:
   sequence_length: ${settings.step_profile.sequence_length}
   prediction_key: ${loss_fn.config.prediction_key}
   vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
-  n_layer: 2
+  n_layer: 6
   n_head_q: 8
   n_head_kv: 4
   ffn_hidden: 128

config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml

Lines changed: 1 addition & 1 deletion

@@ -308,7 +308,7 @@ staged_pipeline:
     instance_key: device_mesh
     pass_type: BY_REFERENCE
   local_rank: ${settings.cuda_env.local_rank}
-  pp_schedule_name: gpipe
+  pp_schedule_name: ${scheduled_pipeline.config.pp_schedule_name}
   num_layers_per_stage: 2
 
 model_raw:

docs/components/components.md

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@
 | scheduler | constant_lr | [ConstantLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ConstantLR.html#torch.optim.lr_scheduler.ConstantLR)| [ConstantLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Multiplies the learning rate of each parameter group by a small constant factor until the number of steps reaches a pre-defined milestone |
 | scheduler | onecycle_lr | [OneCycleLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR)| [OneCycleLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Sets the learning rate of each parameter group according to the 1cycle learning rate policy. |
 | scheduler | cosine_annealing_lr | [CosineAnnealingLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingLR.html#torch.optim.lr_scheduler.CosineAnnealingLR)| [CosineAnnealingLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Set the learning rate of each parameter group using a cosine annealing schedule |
+| scheduler | linear_warmup_cosine_annealing_lr | [LinearWarmupCosineAnnealingLRScheduler](../../src/modalities/optimizers/lr_schedulers.py) | [LinearWarmupCosineAnnealingLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Linearly warms up to the base learning rate, then decays with cosine annealing for the remaining training steps |
 
 
 ## Tokenization
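The linear_warmup_cosine_annealing_lr entry added above follows the standard warmup-then-cosine shape; here is a minimal per-step sketch of that policy (illustrative only, not the actual code in src/modalities/optimizers/lr_schedulers.py):

```python
import math

def linear_warmup_cosine_annealing_lr(step: int, base_lr: float, warmup_steps: int,
                                      total_steps: int, min_lr: float = 0.0) -> float:
    if step < warmup_steps:
        # Linear warmup: ramp from 0 up to the base learning rate.
        return base_lr * (step + 1) / warmup_steps
    # Cosine annealing: decay from base_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```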

pyproject.toml

Lines changed: 76 additions & 9 deletions

@@ -1,16 +1,16 @@
 [project]
 name = "modalities"
-version = "0.4.0"
-requires-python = ">=3.10,<=3.13"
+version = "0.5.0"
+requires-python = ">=3.10,<3.14"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [
     "numpy",
-    "torch",
+    "ninja",
     "packaging",
     "tqdm",
     "pyyaml",
-    "transformers",
+    "transformers>=4.57.4,<5.0.0",
     "datasets",
     "protobuf",
     "SentencePiece",

@@ -31,18 +31,85 @@ dependencies = [
 Homepage = "https://github.com/Modalities/modalities"
 Issues = "https://github.com/Modalities/modalities/issues"
 
-[project.optional-dependencies]
-linting = ["pre-commit"]
-tests = ["pytest", "pytest-cov", "debugpy"]
-install_helper = ["ninja"]
-
 [project.scripts]
 modalities = "modalities.__main__:main"
 
 [build-system]
 requires = ["setuptools >= 61.0.0"]
 build-backend = "setuptools.build_meta"
 
+[project.optional-dependencies]
+linting = ["pre-commit"]
+tests = ["pytest", "pytest-cov", "debugpy"]
+
+cpu = ["torch>=2.10,<2.11.0", "torchvision"]
+cu126 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+cu128 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+cu130 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+
+[tool.uv]
+conflicts = [
+    [
+        { extra = "cpu" },
+        { extra = "cu126" },
+        { extra = "cu128" },
+        { extra = "cu130" },
+    ],
+]
+
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cpu", extra = "cpu" },
+    { index = "pytorch-cu126", extra = "cu126" },
+    { index = "pytorch-cu128", extra = "cu128" },
+    { index = "pytorch-cu130", extra = "cu130" },
+]
+torchvision = [
+    { index = "pytorch-cpu", extra = "cpu" },
+    { index = "pytorch-cu126", extra = "cu126" },
+    { index = "pytorch-cu128", extra = "cu128" },
+    { index = "pytorch-cu130", extra = "cu130" },
+]
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu126"
+url = "https://download.pytorch.org/whl/cu126"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu130"
+url = "https://download.pytorch.org/whl/cu130"
+explicit = true
+
+
+[tool.uv.extra-build-dependencies]
+flash-attn = [
+    { requirement = "torch", match-runtime = true },
+    { requirement = "ninja", match-runtime = true },
+]
+
 [tool.black]
 target-version = ["py310"]
 line-length = 120
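With this layout, exactly one hardware extra (cpu, cu126, cu128, cu130) is chosen at install time, and uv then resolves torch and torchvision from the matching PyTorch wheel index, e.g. `uv sync --extra cu128` for CUDA 12.8 builds. The `conflicts` entry under `[tool.uv]` makes these extras mutually exclusive, and `[tool.uv.extra-build-dependencies]` builds flash-attn against the torch and ninja versions that match the runtime environment. (The command shown is illustrative; see the project README for the recommended install flow.)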
