hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-next-80BA3B-rlvr-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./rl_examples/models/${exp_name}

#track_with: wandb
#tracker_kwargs:
#  api_key:
#  project: roll_examples
#  notes: roll_examples
#  tags:
#    - rlvr
#    - baseline

track_with: tensorboard
tracker_kwargs:
  log_dir: ./roll_exp/rlvr/${exp_name}/

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 6144

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true
# normalize
reward_norm: null
reward_shift: false
reward_scale: false
# data mask
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false
# data weight
difficulty_loss_weight: false
length_loss_weight: false
# reward
add_token_level_kl: false
# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen3-Next-80B-A3B-Instruct
reward_pretrain: Qwen/Qwen3-Next-80B-A3B-Instruct

# validation:
#   data_args:
#     template: qwen2_5
#     file_name:
#       - data/aime24_25_deal.jsonl
#   generating_args:
#     top_p: 0.6
#     top_k: 50
#     num_beams: 1
#     temperature: 0.6
#     num_return_sequences: 1
#   eval_steps: 10

actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 32
    warmup_steps: 1
    num_train_epochs: 5
  data_args:
    template: native
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1.0
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 8
      pipeline_model_parallel_size: 4
      virtual_pipeline_model_parallel_size: 12
      context_parallel_size: 1
      use_distributed_optimizer: true
      # account_for_loss_in_pipeline_split: true
      moe_token_dispatcher_type: alltoall
      recompute_granularity: selective
      recompute_modules: "moe"
      bias_activation_fusion: true
      moe_grouped_gemm: true
      moe_shared_expert_overlap: true
      bf16: true
      additional_configs:
        moe_permute_fusion: true
  device_mapping: list(range(0,64))
  infer_batch_size: 1

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: native
  strategy_args:
    strategy_name: vllm
    strategy_config:
      tensor_parallel_size: 4
      gpu_memory_utilization: 0.7
      block_size: 16
      max_model_len: 8192
      enforce_eager: true
  device_mapping: list(range(0,64))
  infer_batch_size: 1

reference:
  model_args:
    dtype: bf16
    model_type: ~
  data_args:
    template: native
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      expert_model_parallel_size: 8
      pipeline_model_parallel_size: 2
      virtual_pipeline_model_parallel_size: 12
      use_distributed_optimizer: true
      moe_token_dispatcher_type: alltoall
      bias_activation_fusion: true
      moe_grouped_gemm: true
      moe_shared_expert_overlap: true
      additional_configs:
        moe_permute_fusion: true
  device_mapping: list(range(0,64))
  infer_batch_size: 1

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: native
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
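
# Usage note (a sketch, not consumed by the pipeline): device_mapping spans
# GPUs 0-63, i.e. 8 nodes at num_gpus_per_node: 8, with actor_train,
# actor_infer, and reference colocated on the same 64 GPUs; prompt_length +
# response_length (2048 + 6144) matches vLLM's max_model_len of 8192. The
# launch command below is an assumption based on ROLL's example layout, so
# the entrypoint script and config directory may differ in your checkout:
#
#   python examples/start_rlvr_pipeline.py \
#       --config_path <directory containing this file> \
#       --config_name qwen3-next-80BA3B-rlvr-config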