
Commit fb9c5fd

Merge branch 'main' into resolve_local_world_size
2 parents 23cbe8c + bf0e6d3 commit fb9c5fd

199 files changed

Lines changed: 17259 additions & 1446 deletions

Note: large commits have some content hidden by default; only a subset of the 199 changed files is shown below.

.github/workflows/build_and_deploy_documentation.yml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install git -y
-        python -m pip install torch==2.6.0
+        python -m pip install torch==2.7.1
         python -m pip install --upgrade pip setuptools wheel
         export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
         python -m pip install -e .

.github/workflows/tests_full.yml

Lines changed: 2 additions & 2 deletions
@@ -23,11 +23,11 @@ jobs:
         sudo apt-get update
         sudo apt-get install curl -y # required by coveralls
         sudo apt-get install git -y
-        python -m pip install torch==2.6.0
+        python -m pip install torch==2.7.1
         python -m pip install --upgrade pip setuptools wheel
         export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
         python -m pip install ninja # Lowers compilation time of flash attention significantly
-        python -m pip install flash-attn==2.7.4.post1 --no-build-isolation
+        python -m pip install flash-attn==2.8.0.post2 --no-build-isolation
         python -m pip install -e .[tests]
     - name: Run tests
       run: |

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -164,8 +164,15 @@ tests/tmp/*
 *wandb_storage*
 .coverage/*
 *.pbin
+tutorials/scaling_up2/experiments
 tutorials/scaling_up/experiments
 tutorials/profiling/experiments
 tutorials/instruction_tuning/prepared_data
 config_files/instruction_tuning
 data/lorem_ipsum_instruct.jsonl
+tutorials/scaling_up/logs*
+tutorials/scaling_up/experiments_old/*
+results/*
+tutorials/einsum_transformer/experiments/*
+tutorials/warmstart/experiments/*
+

CHANGELOG_DEV.md

Lines changed: 32 additions & 0 deletions
@@ -186,3 +186,35 @@ There are now three AC variants:
 * adds support for Tensor Parallelism (including Sequence Parallelism).
 * adds a debugging toolkit to track the input and output tensors during a forward pass, gradients during the backward pass, and weight tensors.
   Tensors can be either normal Tensors or DTensors.
+
+## PR #389 Benchmark Tooling
+* adds benchmark tooling to modalities, enabling scaling benchmarks across a varying number of nodes and the Cartesian product of configurable hyperparameters.
+
+**Breaking Changes**
+* Renaming: EvaluationResultToDiscSubscriberConfig.output_path -> EvaluationResultToDiscSubscriberConfig.output_file_path
+
+## PR #410 MFU now incorporates dp_degree instead of world_size
+
+This PR fixes the MFU and throughput calculations by taking the data-parallel (DP) degree into account instead of the world size. When parallelization strategies are applied on top of FSDP, the world size differs from the DP degree, and the throughput and MFU metric calculations must reflect this, as done by this PR.
+
+**Breaking Changes**
+* Existing configs need to be adapted to correctly use the DP degree rather than the world size.
+
+## PR #425 Monitoring improvements
+This PR improves training monitoring and logging across runs, along with some other changes made while testing scalability.
+
+**General Changes**
+* Configurable multi-layer FSDP units
+* Option to provide an experiment root path to modalities
+* Added a steppable profiler (e.g., for tracing forward/backward passes)
+* Fix: hybrid sharding is now correctly configurable
+* Completely refactored the profiling
+* Improved error handling: errors are now captured and stored as JSON
+* Added tutorials on the Einsum Transformer (example model integration) and profiling
+
+**Breaking Changes**
+* experiments_root_path is now exposed at the API level
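
To make the PR #410 change concrete, here is a minimal sketch of the corrected arithmetic, assuming conventional throughput and MFU definitions; all names (e.g., `flops_per_token`, `peak_flops_per_gpu`) are illustrative, not the Modalities API:

```python
# Sketch of the PR #410 fix: throughput scales with the data-parallel degree,
# not the world size. Names are assumptions for illustration only.

def tokens_per_second(local_micro_batch_size: int, sequence_length: int,
                      gradient_accumulation_steps: int, dp_degree: int,
                      step_time_s: float) -> float:
    # Only data-parallel ranks consume distinct batches; TP/PP ranks inside a
    # replica work on the same tokens, so multiplying by world_size would
    # overcount throughput.
    global_tokens_per_step = (local_micro_batch_size * sequence_length
                              * gradient_accumulation_steps * dp_degree)
    return global_tokens_per_step / step_time_s


def mfu(observed_tokens_per_s: float, flops_per_token: float,
        world_size: int, peak_flops_per_gpu: float) -> float:
    # MFU still normalizes by all GPUs: every device contributes peak FLOPs,
    # whether it is a DP replica or a TP shard.
    achieved_flops_per_s = observed_tokens_per_s * flops_per_token
    return achieved_flops_per_s / (world_size * peak_flops_per_gpu)


# Example: 8 GPUs with TP degree 2 => dp_degree = 4, not 8.
print(tokens_per_second(2, 4096, 1, dp_degree=4, step_time_s=0.5))  # 65536.0
```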

CITATION.cff

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ authors:
 - family-names: Rutmann
   given-names: Richard
 title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.'
-version: 0.3.2
+version: 0.5.0
 url: https://github.com/Modalities/modalities
 date-released: '2024-12-02'
 preferred-citation:

README.md

Lines changed: 197 additions & 129 deletions
Large diffs are not rendered by default.

config_files/training/config_example_coca.yaml

Lines changed: 3 additions & 2 deletions
@@ -25,21 +25,22 @@ settings:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_num_steps
       config:
         num_steps: ${settings.training_target.num_target_steps}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         sequence_length: ${settings.step_profile.sequence_length}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_samples
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_samples: ${settings.coca_example_settings.train_num_samples}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
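
For orientation, a sketch of the arithmetic these `number_conversion` variants appear to implement, with the `num_ranks` -> `dp_degree` rename applied; the Python function names and the floor-division choice are illustrative assumptions, not the Modalities implementation:

```python
# Illustrative arithmetic behind the number_conversion variants above.
# Function names are stand-ins for this sketch, not the Modalities API.

def num_tokens_from_num_steps(num_steps: int, dp_degree: int,
                              local_micro_batch_size: int,
                              sequence_length: int,
                              gradient_accumulation_steps: int) -> int:
    # Tokens consumed per optimizer step, summed over all data-parallel ranks.
    return (num_steps * dp_degree * local_micro_batch_size
            * sequence_length * gradient_accumulation_steps)

def num_steps_from_num_samples(global_num_samples: int, dp_degree: int,
                               local_micro_batch_size: int,
                               gradient_accumulation_steps: int) -> int:
    # Samples consumed per optimizer step determine how many full steps fit.
    samples_per_step = (dp_degree * local_micro_batch_size
                        * gradient_accumulation_steps)
    return global_num_samples // samples_per_step

# With values like those above (dp_degree == world_size in this config),
# e.g. 100 steps, dp_degree=8, micro batch 1, sequence length 256, grad-accum 1:
print(num_tokens_from_num_steps(100, 8, 1, 256, 1))  # 204800
```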

config_files/training/config_lorem_ipsum_long_fsdp1.yaml

Lines changed: 3 additions & 2 deletions
@@ -26,21 +26,22 @@ settings:
     gradient_accumulation_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}

config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml

Lines changed: 4 additions & 3 deletions
@@ -26,21 +26,22 @@ settings:
     gradient_accumulation_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree: ${settings.cuda_env.world_size}
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree: ${settings.cuda_env.world_size}
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -67,7 +68,7 @@ settings:
     variant_key: last_step_from_checkpoint_path
     config:
       checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path}
-  warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths}
+  warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # use modalities warmstart [..] --last_checkpoint_info_file_path [..]

 collate_fn:
   component_key: collate_fn

config_files/training/config_lorem_ipsum_long_fsdp2.yaml

Lines changed: 40 additions & 21 deletions
@@ -18,29 +18,36 @@ settings:
   checkpointing_interval_in_steps: 32
   evaluation_interval_in_steps: 32
   consistency_enforcement:
-    enforce_tokens_per_step_consistency: true
+    enforce_tokens_per_step_consistency: false
     enforce_last_step_logged: false
     enforce_last_step_evaluated: false
     enforce_last_step_checkpointed: false
   step_profile:
     gradient_accumulation_steps: 1
     local_train_micro_batch_size: 1
     sequence_length: 256
+    dp_degree:
+      instance_key: dp_degree
+      pass_type: BY_REFERENCE
   training_target:
     num_target_tokens:
       component_key: number_conversion
       variant_key: num_tokens_from_packed_mem_map_dataset_continuous
       config:
         dataset_path: ${settings.paths.train_dataset_path}
         sequence_length: ${settings.step_profile.sequence_length}
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
     num_target_steps: # for the batch progress subscriber
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
       config:
-        num_ranks: ${settings.cuda_env.world_size}
+        dp_degree:
+          instance_key: dp_degree
+          pass_type: BY_REFERENCE
         local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
         global_num_tokens: ${settings.training_target.num_target_tokens}
         sequence_length: ${settings.step_profile.sequence_length}
@@ -64,7 +71,7 @@ train_dataset:
   config:
     raw_data_path: ${settings.paths.train_dataset_path}
     sequence_length: ${settings.step_profile.sequence_length}
-    sample_key: ${settings.referencing_keys.sample_key}
+    sample_key: ${settings.referencing_keys.sample_key}

 train_dataloader:
   component_key: data_loader
@@ -172,14 +179,23 @@ device_mesh:
   config:
     device_type: cuda
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: ${settings.cuda_env.world_size} # i.e., fully sharded
+    data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}

+dp_degree:
+  component_key: number_conversion
+  variant_key: parallel_degree
+  config: # get the parallel degree from the device mesh
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    parallelism_methods: [dp_shard, dp_replicate]
+
 app_state:
   component_key: app_state
   variant_key: raw
   config:
-    model:
+    model:
       instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:
@@ -289,7 +305,7 @@ optimizer:
     eps: 1e-8
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
-  wrapped_model:
+  wrapped_model:
     instance_key: initialized_model
     pass_type: BY_REFERENCE
@@ -302,6 +318,9 @@ gradient_clipper:
       pass_type: BY_REFERENCE
   norm_type: P2_NORM
   max_norm: 1.0
+  device_mesh:
+    instance_key: device_mesh
+    pass_type: BY_REFERENCE

 progress_subscriber:
   component_key: progress_subscriber
@@ -326,17 +345,17 @@ evaluation_subscriber:
     directory: wandb_storage
     config_file_path: ${settings.config_file_path}

-# mfu_calculator:
-#   component_key: mfu_calculator
-#   variant_key: gpt2
-#   config:
-#     n_layer: ${model_raw.config.n_layer}
-#     sequence_length: ${settings.step_profile.sequence_length}
-#     n_embd: ${model_raw.config.n_embd}
-#     world_size: ${settings.cuda_env.world_size}
-#     raw_model:
-#       instance_key: model_raw
-#       pass_type: BY_REFERENCE
-#     wrapped_model:
-#       instance_key: initialized_model
-#       pass_type: BY_REFERENCE
+mfu_calculator:
+  component_key: mfu_calculator
+  variant_key: gpt2
+  config:
+    n_layer: ${model_raw.config.n_layer}
+    sequence_length: ${settings.step_profile.sequence_length}
+    n_embd: ${model_raw.config.n_embd}
+    world_size: ${settings.cuda_env.world_size}
+    wrapped_model:
+      instance_key: initialized_model
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
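
In this config, `dp_degree` is derived from the device mesh rather than hard-coded to the world size, and `data_parallel_shard_degree: -1` lets the mesh infer the shard degree from whatever remains after the other parallelism dimensions. A minimal sketch of the idea behind the `parallel_degree` conversion, using a plain dict in place of a real torch DeviceMesh; this is not the Modalities implementation:

```python
# Sketch: the DP degree is the product of the sizes of the requested mesh
# dimensions. `mesh_dims` stands in for a real torch DeviceMesh; names and
# values are illustrative assumptions.

def parallel_degree(mesh_dims: dict[str, int],
                    parallelism_methods: list[str]) -> int:
    degree = 1
    for method in parallelism_methods:
        # Dimensions absent from the mesh contribute a factor of 1.
        degree *= mesh_dims.get(method, 1)
    return degree

# 8 GPUs split as dp_replicate=1, dp_shard=4, tp=2: throughput/MFU
# calculations should use dp_degree=4, while world_size stays 8.
mesh_dims = {"dp_replicate": 1, "dp_shard": 4, "tp": 2}
print(parallel_degree(mesh_dims, ["dp_shard", "dp_replicate"]))  # 4
```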
