# SFT config for NVIDIA-Nemotron-3-Nano-30B-A3B-Base (BF16) with FSDP2 + LoRA
# on 2 nodes x 8 GPUs. Inherits shared settings from ../../sft.yaml.
defaults: ../../sft.yaml

sft:
  max_num_steps: 100

checkpointing:
  enabled: false

policy:
  model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
  train_global_batch_size: 16
  max_total_sequence_length: 2048
  # TODO(automodel-issue): NemotronH needs force_hf=true with transformers v5
  # to bypass custom backbone→model.model attribute mismatch in parallelizer.
  dtensor_cfg:
    automodel_kwargs:
      force_hf: true
  lora_cfg:
    enabled: true
    dim: 256
    alpha: 512
    # Exclude all out_proj modules. When NemotronHMamba2Mixer uses
    # cuda_kernels_forward, out_proj LoRA has no gradient.
    exclude_modules: ['*out_proj*']
    match_all_linear: false
    use_triton: false

logger:
  wandb:
    project: nemo-rl
    name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
  tensorboard:
    log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora
  mlflow:
    run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora

cluster:
  gpus_per_node: 8
  num_nodes: 2