# RLVR training configuration for Qwen/Qwen2.5-VL-7B-Instruct
# (experiment "qwen2_5_vl_7B_rlvr").
#
# Layout: global run/algorithm settings first, then one section per worker
# role (validation, actor_train, actor_infer, reference) and finally the
# per-domain reward workers.
#
# NOTE(review): the nesting in this file was rebuilt from a copy whose
# whitespace had been collapsed; keys whose parent could not be determined
# with certainty carry a "confirm" note.

# Hydra defaults list; the `@_here_` package override merges each referenced
# config into this node.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

# Keep Hydra's run directory at the current directory and disable its
# output subdirectory.
hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen2_5_vl_7B_rlvr"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/yuzhao/models

track_with: tensorboard
tracker_kwargs:
  log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard

save_steps: 20
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 256
num_return_sequences_in_group: 8
is_num_return_sequences_expand: true
prompt_length: 2048
response_length: 4096

ppo_epochs: 1
value_clip: 0.5
reward_clip: 10
advantage_clip: 10.0
whiten_advantages: false
init_kl_coef: 0.0
adv_estimator: "grpo"
use_kl_loss: true
kl_loss_coef: 1.0e-2

# Base model; also referenced by the reward workers via ${pretrain}.
pretrain: Qwen/Qwen2.5-VL-7B-Instruct

validation:
  data_args:
    file_name:
      - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_math_megabench_237.parquet
      - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_detection_coco_test_multi_2000.parquet
    dataset_dir: ./
  generating_args:
    # Interpolated from the top-level response_length.
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: 1
  # NOTE(review): assumed to sit directly under `validation` (a top-level
  # placement would duplicate `eval_steps` and reference itself) - confirm.
  eval_steps: ${eval_steps}

actor_train:
  model_args:
    flash_attn: fa2
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 1.0e-2
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 256
    warmup_steps: 0
    num_train_epochs: 50
  data_args:
    # use One-RL-to-See-Them-All/Orsta-Data-47k as train dataset
    # download from https://huggingface.co/datasets/One-RL-to-See-Them-All/Orsta-Data-47k
    file_name:
      - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_detection_v3det_4000.parquet
      - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_math_mmmath_3539.parquet
    # Keys match the domain names declared under `rewards`.
    # NOTE(review): assumed to belong to `data_args` - confirm.
    domain_interleave_probs:
      math: 0.5
      cv_detection: 0.5
    dataset_dir: ./
    messages: prompt
    preprocessing_num_workers: 32
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      sequence_parallel: true
      tensor_model_parallel_size: 4
      context_parallel_size: 2
      expert_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      overlap_grad_reduce: true
      use_distributed_optimizer: true
      bf16: true
  # Plain string, not a YAML list; presumably evaluated by the consumer
  # into device ids - confirm.
  device_mapping: list(range(0,32))
  infer_batch_size: 8

actor_infer:
  model_args:
    flash_attn: fa2
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    # Interpolated from the top-level response_length.
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    # One group of responses per prompt, sized by the top-level setting.
    num_return_sequences: ${num_return_sequences_in_group}
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      # mm preprocessor cache mismatch error occurred in vllm084
      enable_prefix_caching: false
  # NOTE(review): assumed to be a worker-level key rather than part of
  # `strategy_config` - confirm.
  num_gpus_per_worker: 1
  device_mapping: list(range(0,32))
  infer_batch_size: 32

reference:
  model_args:
    flash_attn: fa2
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: null
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      sequence_parallel: true
      tensor_model_parallel_size: 2
      context_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      bf16: true
  device_mapping: list(range(0,32))
  infer_batch_size: 8

# One reward worker per domain.
rewards:
  math:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${pretrain}
    # data source whose ability is math in One-RL-to-See-Them-All/Orsta-Data-47k
    tag_included: [mm_math, megabench_math]
    world_size: 8
    infer_batch_size: 1
  cv_detection:
    worker_cls: roll.pipeline.rlvr.rewards.detection_reward_worker.DetectionRewardWorker
    model_args:
      model_name_or_path: ${pretrain}
    # data source whose ability is cv_detection in One-RL-to-See-Them-All/Orsta-Data-47k
    tag_included: [v3det_train, object365_train, coco_val_multi_test]
    world_size: 8
    infer_batch_size: 1