diff --git a/configs/1B/H100.toml b/configs/1B/H100.toml index fc219eb..810aa77 100644 --- a/configs/1B/H100.toml +++ b/configs/1B/H100.toml @@ -4,11 +4,12 @@ type_model = "llama2" [train] micro_bs = 16 +reshard_after_forward = true [optim] batch_size = 2048 -warmup_steps = 1000 -total_steps = 88_000 +warmup_steps = 500 +total_steps = 8192 [optim.optim] lr = 4e-4 \ No newline at end of file diff --git a/configs/1B/H100_c4.toml b/configs/1B/H100_c4.toml deleted file mode 100644 index 9bf0128..0000000 --- a/configs/1B/H100_c4.toml +++ /dev/null @@ -1,17 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 16 - -[optim] -batch_size = 128 -warmup_steps = 1000 -total_steps = 88_000 - -[optim.optim] -lr = 3e-4 - -[data] -seq_length = 2048 \ No newline at end of file diff --git a/configs/1B/H100_llama2_edu.toml b/configs/1B/H100_llama2_edu.toml deleted file mode 100644 index f45f21b..0000000 --- a/configs/1B/H100_llama2_edu.toml +++ /dev/null @@ -1,23 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 4 -reshard_after_forward = true - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -z_loss = true - -[optim.optim] -lr = 4e-4 diff --git a/configs/1B/H100_llama2_edu_no_feat.toml b/configs/1B/H100_llama2_edu_no_feat.toml deleted file mode 100644 index fbe2901..0000000 --- a/configs/1B/H100_llama2_edu_no_feat.toml +++ /dev/null @@ -1,25 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 4 -reshard_after_forward = true -attn_fn = "sdpa" -sequence_packing = false - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -z_loss = false - -[optim.optim] -lr = 2e-4 diff --git a/configs/1B/H100_llama3.toml b/configs/1B/H100_llama3.toml deleted file mode 100644 index 497dff5..0000000 --- a/configs/1B/H100_llama3.toml +++ /dev/null @@ -1,23 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama3" - -[train] -micro_bs = 1 -reshard_after_forward = true - -[data] -seq_length = 8192 -num_workers = 4 -dataset_name_or_paths = "/data/datasets/fineweb-edu" -reverse_data_files = true - -[optim] -batch_size = 256 -warmup_steps = 1000 -total_steps = 1_000_000_000_000 -sched_type = "wsd-sqrt" -z_loss = true - -[optim.optim] -lr = 4e-4 diff --git a/configs/1B_diloco/H100.toml b/configs/1B_diloco/H100.toml deleted file mode 100644 index 4cf7d7b..0000000 --- a/configs/1B_diloco/H100.toml +++ /dev/null @@ -1,27 +0,0 @@ -name_model = "1B" -project = "debug_1B_zero_band" -type_model = "llama2" - -[train] -micro_bs = 16 - -[optim] -batch_size = 2048 -warmup_steps = 1000 -total_steps = 88_000 - -z_loss = true - -[optim.optim] -lr = 4e-4 - - -[diloco] -inner_steps = 50 -compression = "uint8" - -[ckpt] -interval = 50 -topk = 3 -path = "outputs_1b_diloco_50" -