Commit 093fa33

chore: Merge branch 'main' into seed
2 parents: 2e248ed + b856127

123 files changed: 5,036 additions & 1,206 deletions


.gitignore

Lines changed: 5 additions & 1 deletion

@@ -171,4 +171,8 @@ tutorials/instruction_tuning/prepared_data
 config_files/instruction_tuning
 data/lorem_ipsum_instruct.jsonl
 tutorials/scaling_up/logs*
-tutorials/scaling_up/experiments_old/*
+tutorials/scaling_up/experiments_old/*
+results/*
+tutorials/einsum_transformer/experiments/*
+tutorials/warmstart/experiments/*
+

CHANGELOG_DEV.md

Lines changed: 17 additions & 1 deletion

@@ -201,4 +201,20 @@ Tensors can be either normal Tensors or DTensors.
 This PR fixes the MFU and throughput calculations by taking the dp degree into account instead of the world size. When we use parallelization strategies on top of FSDP, the world size differs from the data parallel degree, and this needs to be reflected in the throughput and MFU metric calculations, as done by this PR.
 
 **Breaking Changes**
-* Existing configs need to be adapted to correctly use dp degree rather than world size.
+* Existing configs need to be adapted to correctly use dp degree rather than world size.
+
+
+## PR #425 Monitoring improvements
+This PR improves training monitoring and logging across runs, alongside some other changes made while testing scalability.
+
+**General Changes**
+* Configurable multi-layer FSDP units
+* Option to provide an experiment root path to Modalities
+* Added a steppable profiler (e.g., for tracing forward/backward passes)
+* Fix: hybrid sharding is now correctly configurable
+* Completely refactored the profiling
+* Improved error handling; errors are now captured and stored as JSON
+* Added tutorials on the Einsum Transformer (example model integration) and on profiling
+
+**Breaking Changes**
+* experiments_root_path is now exposed at the API level
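For context on the MFU/throughput fix above: a minimal sketch of the reasoning, using hypothetical names (`dp_degree`, `model_flops_per_token`, etc.) rather than Modalities' actual API, which may differ.

```python
# Illustrative sketch (not the Modalities API): once tensor/pipeline parallelism
# runs on top of FSDP, only the data-parallel replicas consume distinct batches,
# so token throughput must scale with dp_degree, not world_size.

def tokens_per_second(local_batch_size: int, sequence_length: int,
                      step_time_s: float, dp_degree: int) -> float:
    # With world_size = 8 split into dp=2 x tp=4, scaling by world_size
    # would report 4x the true throughput; dp_degree (2) is correct.
    return local_batch_size * sequence_length * dp_degree / step_time_s

def mfu(tokens_per_s: float, model_flops_per_token: float,
        peak_flops_per_device: float, num_devices: int) -> float:
    # Model FLOPs utilization: achieved model FLOP/s over aggregate peak FLOP/s.
    return (tokens_per_s * model_flops_per_token) / (peak_flops_per_device * num_devices)
```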

CITATION.cff

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ authors:
 - family-names: Rutmann
   given-names: Richard
 title: 'Modalities: A PyTorch-native framework for distributed and reproducible foundation model training.'
-version: 0.4.0
+version: 0.5.0
 url: https://github.com/Modalities/modalities
 date-released: '2024-12-02'
 preferred-citation:

README.md

Lines changed: 53 additions & 56 deletions
Large diffs are not rendered by default.

config_files/training/config_lorem_ipsum_long_fsdp2.yaml

Lines changed: 6 additions & 3 deletions
(The paired -/+ lines below with identical text appear to be whitespace-only changes, which this rendering cannot show.)

@@ -71,7 +71,7 @@ train_dataset:
   config:
     raw_data_path: ${settings.paths.train_dataset_path}
     sequence_length: ${settings.step_profile.sequence_length}
-    sample_key: ${settings.referencing_keys.sample_key}
+    sample_key: ${settings.referencing_keys.sample_key}
 
 train_dataloader:
   component_key: data_loader

@@ -195,7 +195,7 @@ app_state:
   component_key: app_state
   variant_key: raw
   config:
-    model:
+    model:
       instance_key: initialized_model
       pass_type: BY_REFERENCE
     optimizer:

@@ -305,7 +305,7 @@ optimizer:
     eps: 1e-8
     weight_decay: 1e-1
     weight_decay_groups_excluded: [embedding, layernorm]
-  wrapped_model:
+  wrapped_model:
     instance_key: initialized_model
     pass_type: BY_REFERENCE
 

@@ -318,6 +318,9 @@ gradient_clipper:
     pass_type: BY_REFERENCE
   norm_type: P2_NORM
   max_norm: 1.0
+  device_mesh:
+    instance_key: device_mesh
+    pass_type: BY_REFERENCE
 
 progress_subscriber:
   component_key: progress_subscriber

config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml

Lines changed: 4 additions & 4 deletions

@@ -194,7 +194,7 @@ device_mesh:
   config:
     device_type: cuda
     data_parallel_replicate_degree: 1
-    pipeline_parallel_degree: 2
+    pipeline_parallel_degree: 4
     data_parallel_shard_degree: -1
     world_size: ${settings.cuda_env.world_size}
 

@@ -251,7 +251,7 @@ scheduled_pipeline:
   loss_fn:
     instance_key: loss_fn
     pass_type: BY_REFERENCE
-  pp_schedule_name: gpipe
+  pp_schedule_name: Interleaved1F1B
   batch_size: ${settings.step_profile.local_train_micro_batch_size}
   microbatch_size: 2
   pp_degree: ${device_mesh.config.pipeline_parallel_degree}

@@ -318,7 +318,7 @@ staged_pipeline:
     instance_key: device_mesh
     pass_type: BY_REFERENCE
   local_rank: ${settings.cuda_env.local_rank}
-  pp_schedule_name: gpipe
+  pp_schedule_name: ${scheduled_pipeline.config.pp_schedule_name}
   num_layers_per_stage: 2
 
 model_raw:

@@ -332,7 +332,7 @@ model_raw:
   sequence_length: ${settings.step_profile.sequence_length}
   prediction_key: ${loss_fn.config.prediction_key}
   vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
-  n_layer: 2
+  n_layer: 6
   n_head_q: 8
   n_head_kv: 4
   ffn_hidden: 128

config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml

Lines changed: 1 addition & 1 deletion

@@ -308,7 +308,7 @@ staged_pipeline:
     instance_key: device_mesh
     pass_type: BY_REFERENCE
   local_rank: ${settings.cuda_env.local_rank}
-  pp_schedule_name: gpipe
+  pp_schedule_name: ${scheduled_pipeline.config.pp_schedule_name}
   num_layers_per_stage: 2
 
 model_raw:

docs/components/components.md

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@
 | scheduler | constant_lr | [ConstantLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ConstantLR.html#torch.optim.lr_scheduler.ConstantLR)| [ConstantLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Multiplies the learning rate of each parameter group by a small constant factor until the number of steps reaches a pre-defined milestone |
 | scheduler | onecycle_lr | [OneCycleLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR)| [OneCycleLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Sets the learning rate of each parameter group according to the 1cycle learning rate policy. |
 | scheduler | cosine_annealing_lr | [CosineAnnealingLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingLR.html#torch.optim.lr_scheduler.CosineAnnealingLR)| [CosineAnnealingLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Set the learning rate of each parameter group using a cosine annealing schedule |
+| scheduler | linear_warmup_cosine_annealing_lr | [LinearWarmupCosineAnnealingLRScheduler](../../src/modalities/optimizers/lr_schedulers.py) | [LinearWarmupCosineAnnealingLRSchedulerConfig](../../src/modalities/config/config.py) | [LRScheduler](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) | Linearly warms up to the base learning rate, then decays with cosine annealing for the remaining training steps |
 
 
 ## Tokenization
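The linear_warmup_cosine_annealing_lr entry added above follows the standard warmup-then-cosine shape; here is a minimal per-step sketch of that policy (illustrative only, not the actual code in src/modalities/optimizers/lr_schedulers.py):

```python
import math

def linear_warmup_cosine_annealing_lr(step: int, base_lr: float, warmup_steps: int,
                                      total_steps: int, min_lr: float = 0.0) -> float:
    if step < warmup_steps:
        # Linear warmup: ramp from 0 up to the base learning rate.
        return base_lr * (step + 1) / warmup_steps
    # Cosine annealing: decay from base_lr down to min_lr over the remaining steps.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```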

pyproject.toml

Lines changed: 76 additions & 9 deletions

@@ -1,16 +1,16 @@
 [project]
 name = "modalities"
-version = "0.4.0"
-requires-python = ">=3.10,<=3.13"
+version = "0.5.0"
+requires-python = ">=3.10,<3.14"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [
     "numpy",
-    "torch",
+    "ninja",
     "packaging",
     "tqdm",
     "pyyaml",
-    "transformers",
+    "transformers>=4.57.4,<5.0.0",
     "datasets",
     "protobuf",
     "SentencePiece",

@@ -31,18 +31,85 @@ dependencies = [
 Homepage = "https://github.com/Modalities/modalities"
 Issues = "https://github.com/Modalities/modalities/issues"
 
-[project.optional-dependencies]
-linting = ["pre-commit"]
-tests = ["pytest", "pytest-cov", "debugpy"]
-install_helper = ["ninja"]
-
 [project.scripts]
 modalities = "modalities.__main__:main"
 
 [build-system]
 requires = ["setuptools >= 61.0.0"]
 build-backend = "setuptools.build_meta"
 
+[project.optional-dependencies]
+linting = ["pre-commit"]
+tests = ["pytest", "pytest-cov", "debugpy"]
+
+cpu = ["torch>=2.10,<2.11.0", "torchvision"]
+cu126 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+cu128 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+cu130 = [
+    "torch>=2.10,<2.11.0",
+    "torchvision",
+    "flash-attn==2.8.3; platform_system != 'Darwin' and platform_machine != 'aarch64'"
+]
+
+[tool.uv]
+conflicts = [
+    [
+        { extra = "cpu" },
+        { extra = "cu126" },
+        { extra = "cu128" },
+        { extra = "cu130" },
+    ],
+]
+
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cpu", extra = "cpu" },
+    { index = "pytorch-cu126", extra = "cu126" },
+    { index = "pytorch-cu128", extra = "cu128" },
+    { index = "pytorch-cu130", extra = "cu130" },
+]
+torchvision = [
+    { index = "pytorch-cpu", extra = "cpu" },
+    { index = "pytorch-cu126", extra = "cu126" },
+    { index = "pytorch-cu128", extra = "cu128" },
+    { index = "pytorch-cu130", extra = "cu130" },
+]
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu126"
+url = "https://download.pytorch.org/whl/cu126"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu130"
+url = "https://download.pytorch.org/whl/cu130"
+explicit = true
+
+
+[tool.uv.extra-build-dependencies]
+flash-attn = [
+    { requirement = "torch", match-runtime = true },
+    { requirement = "ninja", match-runtime = true },
+]
+
 [tool.black]
 target-version = ["py310"]
 line-length = 120
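With this layout, exactly one hardware extra (cpu, cu126, cu128, cu130) is chosen at install time, and uv then resolves torch and torchvision from the matching PyTorch wheel index, e.g. `uv sync --extra cu128` for CUDA 12.8 builds. The `conflicts` entry under `[tool.uv]` makes these extras mutually exclusive, and `[tool.uv.extra-build-dependencies]` builds flash-attn against the torch and ninja versions that match the runtime environment. (The command shown is illustrative; see the project README for the recommended install flow.)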
