diff --git a/examples/xglm/example_config_moe.yaml b/examples/xglm/example_config_moe.yaml
new file mode 100644
index 00000000..aa0d739c
--- /dev/null
+++ b/examples/xglm/example_config_moe.yaml
@@ -0,0 +1,113 @@
+checkpoints:
+  checkpoint_interval: 1000
+  checkpoints_path: checkpoints-xglm-moe-8x564M
+  checkpoints_path_is_shared_file_system: false
+  resume_checkpoint_path: checkpoints-xglm-moe-8x564M
+  save_initial_state: false
+data_stages:
+- data:
+    dataset:
+      training_folder:
+      - datasets/c4-es/train
+      - datasets/c4-en/train
+      - datasets/c4-fr/train
+      validation_folder:
+      - datasets/c4-es/validation
+      - datasets/c4-en/validation
+      - datasets/c4-fr/validation
+      languages:
+      - es
+      - en
+      - fr
+    num_loading_workers: 1
+    seed: 42
+  name: General purpose training (Blended dataset)
+  start_training_step: 1
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: null
+  ignore_sanity_checks: true
+  project: multilingual-moe-init
+  run: xglm-topk2-8x564M
+  seed: 42
+  step: null
+lighteval: null
+logging:
+  iteration_step_info_interval: 1
+  log_level: info
+  log_level_replica: info
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    # for random init:
+    # std: 0.02
+    # or upcycling after using the converter scripts:
+    path: /mloscratch/homes/haegele/swissai/nanotron-multilingual/xglm/xglm-moe
+  make_vocab_size_divisible_by: 1
+  model_config:
+    activation_function: gelu
+    attn_pdrop: 0.0
+    embd_pdrop: 0.0
+    scale_embedding: true
+    eos_token_id: 2
+    hidden_size: 1024
+    intermediate_size: 4096
+    layer_norm_epsilon: 0.00001
+    max_position_embeddings: 2048
+    num_attention_heads: 16
+    num_hidden_layers: 24
+    resid_pdrop: 0.0
+    scale_attention_softmax_in_fp32: true
+    scale_attn_weights: true
+    vocab_size: 256008
+    sinusoidal_position_embedding: true
+    position_embedding_offset: 2
+    use_spda: false
+    act_pdrop: 0.0
+    is_moe: true
+    moe_num_experts: 8
+    num_experts_per_tok: 2
+    moe_loss_weight: 0.01
+    moe_z_loss_weight: 0.001
+    moe_glu: false
+optimizer:
+  accumulate_grad_in_fp32: true
+  clip_grad: 1.0
+  learning_rate_scheduler:
+    learning_rate: 0.001
+    lr_decay_starting_step: 2000
+    lr_decay_steps: 500
+    lr_decay_style: 1-sqrt
+    lr_warmup_steps: 100
+    lr_warmup_style: linear
+    min_decay_lr: 0
+  optimizer_factory:
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    adam_eps: 1.0e-08
+    name: adamW
+    torch_adam_is_fused: true
+  weight_decay: 0.1
+  zero_stage: 0
+parallelism:
+  dp: 3
+  expert_parallel_size: 1
+  pp: 1
+  pp_engine: 1f1b
+  tp: 1
+  tp_linear_async_communication: false
+  tp_mode: REDUCE_SCATTER
+profiler: null
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: facebook/xglm-564M
+  tokenizer_revision: null
+tokens:
+  batch_accumulation_per_replica: 5
+  limit_test_batches: 0
+  limit_val_batches: 10
+  micro_batch_size: 4 # fits on one 80GB A100 for 8x564M
+  sequence_length: 2048
+  train_steps: 2500
+  val_check_interval: -1
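
A minimal launch sketch for this config, not part of the patch: it assumes the fork keeps nanotron's standard run_train.py entry point and --config-file flag, and with dp=3, tp=1, pp=1 the config expects a world size of 3 GPUs.

    # assumption: standard nanotron entry point; world size = dp * tp * pp = 3
    CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=3 run_train.py \
        --config-file examples/xglm/example_config_moe.yaml

The global batch size implied by the tokens section is micro_batch_size * batch_accumulation_per_replica * dp = 4 * 5 * 3 = 60 sequences of 2048 tokens per step.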