NVIDIA-NeMo · RayenTian · Nov 19, 2025
@@ -46,3 +46,6 @@ code_snapshots*/
 # Runtime env
 *runtime_env.yaml
 !default_runtime_env.yaml
+
+# temp
+slurm/
@@ -40,18 +40,18 @@ repos:
         exclude: '^\.github/'
         types: [file]
 
-  - repo: local
-    hooks:
-      - id: pyrefly-typecheck
-        name: pyrefly check
-        entry: uv run --group dev pyrefly check
-        types_or: [python, pyi]
-        language: system
-        pass_filenames: false # Pyrefly reads config & project roots itself.
-        args: []
-        require_serial: true
-        additional_dependencies: []
-        minimum_pre_commit_version: "2.9.2"
+  # - repo: local
+  #   hooks:
+  #     - id: pyrefly-typecheck
+  #       name: pyrefly check
+  #       entry: uv run --group dev pyrefly check
+  #       types_or: [python, pyi]
+  #       language: system
+  #       pass_filenames: false # Pyrefly reads config & project roots itself.
+  #       args: []
+  #       require_serial: true
+  #       additional_dependencies: []
+  #       minimum_pre_commit_version: "2.9.2"
 
   # This pre-commit hook ensures that the config file is minimized and reflects exactly what you
   # intend to merge. Without it, you might run experiments with one config, but when merging upstream,
@@ -63,28 +63,28 @@ repos:
   #
   # If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
   # is accepted upstream, we expect the config to be minimized.
-  - repo: local
-    hooks:
-      - id: configs-minimize-check-llm
-        name: minimize-check llm recipes
-        language: system
-        pass_filenames: false
-        entry: bash
-        args:
-          - -lc
-          - |
-            set -euo pipefail
-            base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
-            base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
-            base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
-            base="examples/configs/distillation_math.yaml"; for f in examples/configs/recipes/llm/distillation-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
-      - id: configs-minimize-check-vlm
-        name: minimize-check vlm recipes
-        language: system
-        pass_filenames: false
-        entry: bash
-        args:
-          - -lc
-          - |
-            set -euo pipefail
-            base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
+  # - repo: local
+  #   hooks:
+  #     - id: configs-minimize-check-llm
+  #       name: minimize-check llm recipes
+  #       language: system
+  #       pass_filenames: false
+  #       entry: bash
+  #       args:
+  #         - -lc
+  #         - |
+  #           set -euo pipefail
+  #           base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
+  #           base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
+  #           base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
+  #           base="examples/configs/distillation_math.yaml"; for f in examples/configs/recipes/llm/distillation-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
+  #     - id: configs-minimize-check-vlm
+  #       name: minimize-check vlm recipes
+  #       language: system
+  #       pass_filenames: false
+  #       entry: bash
+  #       args:
+  #         - -lc
+  #         - |
+  #           set -euo pipefail
+  #           base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
@@ -5,6 +5,19 @@ checkpointing:
   checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1
   save_period: 100
 policy:
+  dtensor_cfg:
+    _v2: true
+    lora:
+      enabled: false  # LoRA is switched off in this recipe
+      target_modules: [] # match_all_linear takes precedence over this list
+      exclude_modules: []
+      match_all_linear: true
+      dim: 32
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init: "xavier"
+      use_triton: true
   tokenizer:
     name: meta-llama/Llama-3.2-1B
   make_sequence_length_divisible_by: 1

@@ -0,0 +1,53 @@
+defaults: ../../sft.yaml  # NOTE(review): presumably the base config these keys override; confirm
+sft:
+  max_num_steps: 350
+  val_period: 20
+  val_global_batch_size: 128
+  val_micro_batch_size: 2
+  val_batches: 8
+checkpointing:
+  checkpoint_dir: results/sft-tmblog-llama3.1-8b
+  save_period: 20
+policy:
+  model_name: meta-llama/Llama-3.1-8B
+  tokenizer:
+    name: meta-llama/Llama-3.1-8B-Instruct  # Instruct tokenizer, not the model_name repo above
+    chat_template: default
+  train_global_batch_size: 128
+  train_micro_batch_size: 1
+  max_total_sequence_length: 4096
+  precision: "bfloat16"
+  dtensor_cfg:
+    tensor_parallel_size: 1
+    _v2: true
+    lora:
+      enabled: false  # LoRA is switched off in this recipe
+      target_modules: [] # match_all_linear takes precedence over this list
+      exclude_modules: []
+      match_all_linear: true
+      dim: 32
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init: "xavier"
+      use_triton: true
+  make_sequence_length_divisible_by: 2
+  optimizer:
+    kwargs:
+      lr: 2.0e-05
+      weight_decay: 0.01
+      eps: 1.0e-08
+data:
+  dataset_name: tulu3
+  add_generation_prompt: true
+  seed: 42
+logger:
+  log_dir: logs/sft-tmblog-llama3.1-8b
+  tensorboard_enabled: false
+  wandb:
+    project: nemo-rl
+    name: sft-tmblog-llama3.1-8b
+  tensorboard:
+    log_dir: tb_logs-sft-dev-tulu3
+cluster:
+  gpus_per_node: 8
@@ -0,0 +1,213 @@
+# SFT Algorithm Configuration
+sft:
+  ## Total number of training steps:
+  ## min(max_num_epochs * len(train_dataloader), max_num_steps)
+  max_num_epochs: 1
+  max_num_steps: 60
+
+  val_period: 10
+  val_batches: 8
+  val_global_batch_size: 32
+  val_micro_batch_size: 1
+  val_at_start: true
+  seed: 42
+
+checkpointing:
+  enabled: false
+  checkpoint_dir: "results/sft"
+  metric_name: "val:val_loss" # "val:" or "train:" prefix, followed by the metric name
+  higher_is_better: false  # the metric above is a loss, so lower is better
+  keep_top_k: 3
+  save_period: 10
+  checkpoint_must_save_by: null
+
+policy:
+  model_name: "/models/Qwen3-0.6B"  # NOTE(review): absolute local path; confirm it exists wherever this config runs
+  tokenizer:
+    name: ${policy.model_name} ## set this to use a tokenizer different from the model's default
+    # chat_template can be a Jinja template string or path to a .jinja file
+    chat_template: "{% for message in messages %}{%- if message['role'] == 'system'  %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user'  %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant'  %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}"
+    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
+  train_global_batch_size: 32
+  train_micro_batch_size: 1
+  max_total_sequence_length: 1024
+  precision: "bfloat16"
+
+  dtensor_cfg:
+    enabled: true  # this backend is on; megatron_cfg in this file has enabled: false
+    _v2: true
+    env_vars: {}
+    cpu_offload: false
+    sequence_parallel: false
+    activation_checkpointing: false
+    tensor_parallel_size: 1
+    context_parallel_size: 1
+    custom_parallel_plan: null
+    lora:
+      enabled: true
+      target_modules: [] # match_all_linear takes precedence over this list
+      exclude_modules: []
+      match_all_linear: true
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init: "xavier"
+      lora_dtype: ${policy.precision}  # follows policy.precision ("bfloat16" in this file)
+      use_triton: true
+
+  dynamic_batching:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}  # assumes `mul` is a multiplying resolver registered by the config loader; confirm
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}  # assumes `mul` is a multiplying resolver registered by the config loader; confirm
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+  # Makes the training sequence length divisible by the tensor parallel size
+  # (1 in this file); this is useful for sequence-parallel training.
+  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
+  max_grad_norm: 1.0  # also interpolated into megatron_cfg.optimizer.clip_grad
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: 5.0e-6
+      weight_decay: 0.1
+      betas: [0.9, 0.98]
+      eps: 1.0e-5  # decimal point kept so YAML 1.1 loaders (e.g. PyYAML) resolve a float, not a string
+      # when using DTensor, we need to set foreach
+      # and fused to false
+      foreach: false
+      fused: false
+
+  ## ignored since enabled=false, but needed for testing purposes
+  megatron_cfg:
+    enabled: false
+    env_vars: {}
+    empty_unused_memory_level: 1
+    activation_checkpointing: false
+    tensor_model_parallel_size: 1
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    context_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    sequence_parallel: false
+    freeze_moe_router: false
+    moe_router_dtype: null
+    moe_router_load_balancing_type: "aux_loss"
+    moe_router_bias_update_rate: 1.0e-3  # decimal point kept so YAML 1.1 loaders resolve a float, not a string
+    moe_permute_fusion: false
+    # gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: true
+    # gives ~25% training perf speedup with sequence packing and apply_rope_fusion
+    bias_activation_fusion: true
+    defer_fp32_logits: false
+
+    optimizer:
+      optimizer: "adam"
+      lr: 5.0e-6
+      min_lr: 4.9999e-6
+      weight_decay: 0.1
+      bf16: false
+      fp16: false
+      params_dtype: "float32"
+
+      # adam
+      adam_beta1: 0.9
+      adam_beta2: 0.98
+      adam_eps: 1.0e-5  # decimal point kept so YAML 1.1 loaders resolve a float, not a string
+
+      # sgd
+      sgd_momentum: 0.9
+
+      # distributed optimizer
+      use_distributed_optimizer: true
+      use_precision_aware_optimizer: true
+
+      clip_grad: ${policy.max_grad_norm}
+
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "constant"
+      lr_decay_iters: 1000
+      lr_warmup_iters: 50
+      lr_warmup_init: 4.9999e-6
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      data_parallel_sharding_strategy: "optim_grads_params"
+      use_custom_fsdp: false
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}  # follows policy.max_total_sequence_length (1024 in this file)
+  add_bos: true
+  add_eos: true
+  add_generation_prompt: false
+  shuffle: true
+  num_workers: 1
+
+  dataset_name: "squad"
+  # You can use custom response datasets for training and validation. For example:
+  #   data:
+  #     dataset_name: ResponseDataset
+  #     train_data_path: <PathToTrainingDataset>  # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
+  #     val_data_path: <PathToValidationDataset>
+  #     input_key: <QuestionKey>  # default is "input"
+  #     output_key: <AnswerKey>  # default is "output"
+  #     train_split: <TrainSplit>  # default is None; used for HuggingFace datasets
+  #     val_split: <ValSplit>  # default is None; used for HuggingFace datasets
+  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.
+
+  ## unused with the squad dataset
+  prompt_file: null
+  split: null
+  output_key: null
+  seed: null
+
+
+  ## OpenAI-format-specific configs
+  # train_data_path: "/path/to/train.jsonl"  # Path to training data
+  # val_data_path: "/path/to/val.jsonl"      # Path to validation data
+  # chat_key: "messages"                     # Key for messages in the data
+  # system_key: null                         # Key for system message (optional)
+  # system_prompt: null                      # Default system prompt (optional)
+  # tool_key: "tools"                        # Key for tools in the data
+  # use_preserving_dataset: false            # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures)
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  wandb_enabled: false # Make sure you run `wandb login [Your API key]` before running
+  tensorboard_enabled: false
+  mlflow_enabled: false
+  swanlab_enabled: false # Disable SwanLab logging
+  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb:
+    project: "sft-dev"
+    name: "sft-dev-${data.dataset_name}"  # "sft-dev-squad" with the dataset_name set in this file
+  tensorboard:
+    log_dir: "tb_logs-sft-dev-${data.dataset_name}"
+  mlflow:
+    experiment_name: "sft-dev"
+    run_name: "sft-dev-${data.dataset_name}"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 1
+  num_nodes: 1  # one node with one GPU in total
@@ -589,6 +589,7 @@ def sft_train(
                         f"  • Training Model Floating Point Utilization: {100 * total_tflops / theoretical_tflops:.2f}%"
                     )
                     metrics["train_fp_utilization"] = total_tflops / theoretical_tflops
+                print(f"  • Grad norm: {float(metrics['grad_norm']):.4f}")
             print("\n⏱️  Timing:")
             # Display total time first, separately
             total_time = timing_metrics.get("total_step_time", 0)