# SFT config for NVIDIA-Nemotron-3-Nano-30B-A3B-Base (BF16) with FSDP2 + LoRA
# on 2 nodes x 8 GPUs. Inherits shared settings from ../../sft.yaml.
defaults: ../../sft.yaml

sft:
  max_num_steps: 100

checkpointing:
  enabled: false

policy:
  model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
  train_global_batch_size: 16
  max_total_sequence_length: 2048
  # TODO(automodel-issue): NemotronH needs force_hf=true with transformers v5
  # to bypass custom backbone→model.model attribute mismatch in parallelizer.
  dtensor_cfg:
    automodel_kwargs:
      force_hf: true
  lora_cfg:
    enabled: true
    dim: 256
    alpha: 512
    # Exclude all out_proj modules. When NemotronHMamba2Mixer uses
    # cuda_kernels_forward, out_proj LoRA has no gradient.
    exclude_modules: ['*out_proj*']
    match_all_linear: false
    use_triton: false

logger:
  wandb:
    project: nemo-rl
    name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
  tensorboard:
    log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora
  mlflow:
    run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora

cluster:
  gpus_per_node: 8
  num_nodes: 2