# RLVR (RL with verifiable rewards) pipeline configuration for Qwen3.5-35B-A3B.
# Hydra config: the `defaults` list pulls in shared DeepSpeed strategy presets;
# `${...}` values are OmegaConf interpolations resolved at load time.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3_5_35BA3_rlvr"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/models

track_with: tensorboard
tracker_kwargs:
  log_dir: ./output/tensorboard

# Trainer schedule (steps are optimizer steps).
max_steps: 500
save_steps: 100
eval_steps: 20
logging_steps: 1
resume_from_checkpoint: false

# Rollout / sampling geometry.
rollout_batch_size: 256
num_return_sequences_in_group: 8
is_num_return_sequences_expand: true
prompt_length: 2048
response_length: 4096

# PPO / GRPO objective settings.
ppo_epochs: 1
value_clip: 0.5
reward_clip: 10
advantage_clip: 10.0
whiten_advantages: false
init_kl_coef: 0.0
adv_estimator: "grpo"

use_kl_loss: true
kl_loss_coef: 1.0e-2

pretrain: Qwen/Qwen3.5-35B-A3B

validation:
  data_args:
    file_name:
      - data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_math_megabench_237.parquet
      - data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_detection_coco_test_multi_2000.parquet
    dataset_dir: ./
    preprocessing_num_workers: 32
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: 1
  eval_steps: ${eval_steps}

actor_train:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
    # Vision tower is frozen during RL fine-tuning.
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 1.0e-2
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 256
    warmup_steps: 0
    num_train_epochs: 50
  data_args:
    # use One-RL-to-See-Them-All/Orsta-Data-47k as train dataset
    # download from https://huggingface.co/datasets/One-RL-to-See-Them-All/Orsta-Data-47k
    file_name:
      - data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_detection_v3det_4000.parquet
      - data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_math_mmmath_3539.parquet
    domain_interleave_probs:
      math: 0.5
      cv_detection: 0.5
    dataset_dir: ./
    messages: prompt
    preprocessing_num_workers: 32
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      sequence_parallel: true
      tensor_model_parallel_size: 1
      context_parallel_size: 1
      expert_model_parallel_size: 8
      pipeline_model_parallel_size: 2
      use_distributed_optimizer: true
      moe_token_dispatcher_type: alltoall
      recompute_granularity: selective
      recompute_modules: "moe,layernorm"
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_shared_expert_overlap: true
      bf16: true
  # Evaluated as a Python expression by the framework — kept verbatim.
  device_mapping: list(range(0,16))
  infer_batch_size: 1

actor_infer:
  model_args:
    flash_attn: fa2
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.65
      max_model_len: 8192
      load_format: dummy
      tensor_parallel_size: 2
      optimization_level: 1
      sleep_level: 2
  # NOTE(review): worker-level placement assumed (matches tensor_parallel_size: 2
  # above); confirm it is not meant to sit inside strategy_config.
  num_gpus_per_worker: 2
  device_mapping: list(range(0,16))
  infer_batch_size: 1

reference:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: null
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      sequence_parallel: true
      tensor_model_parallel_size: 1
      context_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 4
      bf16: true
  device_mapping: list(range(0,16))
  infer_batch_size: 1

rewards:
  math:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${pretrain}
    # data source whose ability is math in One-RL-to-See-Them-All/Orsta-Data-47k
    tag_included: [mm_math, megabench_math]
    world_size: 8
    infer_batch_size: 1
  cv_detection:
    worker_cls: roll.pipeline.rlvr.rewards.detection_reward_worker.DetectionRewardWorker
    model_args:
      model_name_or_path: ${pretrain}
    # data source whose ability is cv_detection in One-RL-to-See-Them-All/Orsta-Data-47k
    tag_included: [v3det_train, object365_train, coco_val_multi_test]
    world_size: 8
    infer_batch_size: 1