hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-235BA22B-rlvr-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./rl_examples/models/${exp_name}

track_with: tensorboard
tracker_kwargs:
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 64  # number of prompts per rollout step
prompt_length: 2048
response_length: 4096

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true
# normalize
norm_mean_type: ~
norm_std_type: ~
# data mask
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false
# data weight
difficulty_loss_weight: false
length_loss_weight: false
# reward
add_token_level_kl: false
# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen3-235B-A22B
reward_pretrain: Qwen/Qwen3-235B-A22B

validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10

actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/code_KodCode_data.jsonl
      # - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl
      - data/math_deepmath_deal.jsonl
      - data/general_ifeval_train_deal.jsonl
      - data/general_CrossThink-QA_deal.jsonl
    domain_interleave_probs:
      math_rule: 0.4
      code_sandbox: 0.3
      # llm_judge: 0.1
      crossthinkqa: 0.1
      ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      context_parallel_size: 1
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      overlap_grad_reduce: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_layer_recompute: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.75
      load_format: dummy
      tensor_parallel_size: 8
    num_gpus_per_worker: 8
  device_mapping: list(range(0,200))  # GPUs 0-199; the llm_judge reward uses GPUs 200-255
  infer_batch_size: 1

reference:
  model_args:
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2

rewards:
  crossthinkqa:
    worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [crossthinkqa]
    world_size: 8
    infer_batch_size: 4
  ifeval:
    worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker
    reward_type: soft
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [ifeval]
    world_size: 8
    infer_batch_size: 4
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
    # dynamic filter config
    # query_filter_config:
    #   type: mean_filter
    #   filter_args:
    #     threshold_up: 0.9
    #     threshold_down: 0.1
  code_sandbox:
    use_local: true
    worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker
    tag_included: [KodCode]
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    world_size: 8
    infer_batch_size: 1
    # query_filter_config:
    #   type: std_filter
    #   filter_args:
    #     std_threshold: 0
  llm_judge:
    # NOTE: LLM-as-judge also needs GPUs; it must not share GPUs with actor_infer
    worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker
    judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt
    judge_model_type: inference
    tag_included: [RLVR]
    model_args:
      model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR
      attn_implementation: fa2
      disable_gradient_checkpointing: true
      dtype: bf16
      model_type: trl
    generating_args:
      max_new_tokens: 100
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0.8
      num_return_sequences: 1
    data_args:
      template: qwen3
    strategy_args:
      # strategy_name: hf_infer
      # strategy_config: null
      strategy_name: vllm
      strategy_config:
        gpu_memory_utilization: 0.75
        block_size: 16
        max_model_len: 8000
        load_format: auto
    device_mapping: list(range(200,256))
    infer_batch_size: 4
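
# GPU budget (read off the device_mapping fields above): the job spans 256
# GPUs, i.e. 32 nodes at num_gpus_per_node 8. actor_train and reference are
# mapped across all 256 devices; actor_infer runs vLLM on devices 0-199, and
# the llm_judge reward keeps a separate vLLM instance on devices 200-255,
# consistent with the NOTE that the judge must not share GPUs with actor_infer.
# The rule-based rewards (crossthinkqa, ifeval, math_rule, code_sandbox)
# declare only a world_size and no device_mapping.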
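
# Batch-size sanity check (a sketch, assuming the usual Megatron layout where
# data-parallel size = len(device_mapping) / (TP x PP x CP)):
#   actor_train model replica  = TP 4 x PP 8 x CP 1 = 32 GPUs
#   data-parallel size (DP)    = 256 / 32 = 8 replicas
#   rollout samples per step   = rollout_batch_size 64
#                                x num_return_sequences_in_group 8 = 512
#   samples per optimizer step = per_device_train_batch_size 1
#                                x gradient_accumulation_steps 64 x DP 8 = 512
# With ppo_epochs: 1, one optimizer step consumes exactly one rollout batch.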