hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "distill_vl_megatron"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

save_steps: 100
logging_steps: 1
resume_from_checkpoint: false

student_pretrain: Qwen/Qwen2.5-VL-7B-Instruct
teacher_pretrain: Qwen/Qwen2.5-VL-32B-Instruct

# distill config
logits_topk: 64                       # number of top teacher logits transferred per token
distill_loss_weight: 0.85             # weight given to the distillation loss
kd_objective: forward_kl
distill_on_prompt: true               # also compute the distillation loss on prompt tokens
logits_transfer_backend: "nccl_only"  # supported values: "ipc+nccl", "nccl_only", "ray"

sequence_length: 1024
max_grad_norm: 1.0

student:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 2.0e-5
    lr_scheduler_type: constant
    per_device_train_batch_size: 4
    gradient_accumulation_steps: 1
    warmup_steps: 0
    num_train_epochs: 1
    max_steps: 1000
  data_args:
    template: qwen2-vl
    # uses leonardPKU/GEOQA_R1V_Train_8K as the dataset; download it to
    # ./data/geoqa_data from https://huggingface.co/datasets/leonardPKU/GEOQA_R1V_Train_8K
    file_name: data/geoqa_data/
    dataset_dir: ./
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 2
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))

teacher:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  data_args:
    template: qwen2-vl
  training_args:
    # micro batch size for the teacher forward pass
    per_device_train_batch_size: 1
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 2
      bf16: true
  device_mapping: list(range(0,8))

system_envs:
  RAY_PROFILING: "0"
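
# A minimal sketch of how the distill settings above are expected to combine
# per token (an assumption about the objective, not the pipeline's actual
# code), where p_T is the teacher distribution restricted to its top
# `logits_topk` entries and p_S is the student distribution:
#
#   loss = distill_loss_weight * KL(p_T || p_S)       # kd_objective: forward_kl
#        + (1 - distill_loss_weight) * CE(student)    # standard LM loss
#
# With distill_on_prompt: true, this loss is applied to prompt tokens as well
# as response tokens.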