---
# Training configuration with three top-level sections: model, data, train.
#
# NOTE(review): this file was restored from a single collapsed line. The
# nesting below is reconstructed from key order; confirm against the
# consumer's schema, in particular that `joints` through `norm_stats_file`
# belong under `data` rather than at another level.

model:
  model_path: robbyant/lingbot-vla-4b
  tokenizer_path: Qwen/Qwen2.5-VL-3B-Instruct

data:
  datasets_type: vla
  # NOTE(review): `data_name`, `train_path` and `norm_stats_file` look like
  # placeholders that are presumably meant to be replaced with real values
  # before launching a run; confirm.
  data_name: robot_config_filename
  train_path: path_to_dataset
  # Each entry is a single-key mapping of a joint feature name to an integer.
  # NOTE(review): the integer is presumably that feature's dimension; verify.
  joints:
    - arm.position: 14
    - effector.position: 2
  cameras:
    - camera_top
    - camera_wrist_left
    - camera_wrist_right
  num_workers: 8
  norm_type: meanstd
  norm_stats_file: norm_path

train:
  output_dir: "output/"
  data_parallel_mode: fsdp2
  enable_full_shard: false
  module_fsdp_enable: true
  use_compile: true
  rmpad: false
  rmpad_with_pos_ids: false
  ulysses_parallel_size: 1
  freeze_vision_encoder: false
  tokenizer_max_length: 72
  max_action_dim: 75
  max_state_dim: 75
  lr: 5.0e-5
  lr_decay_style: constant
  micro_batch_size: 32
  # global_batch_size = micro_batch_size * gradient_accumulation_steps * 8
  #                   = 256 when training with 8 GPUs.
  gradient_accumulation_steps: 1
  max_steps: 40000
  ckpt_manager: dcp
  save_steps: 10000
  save_epochs: 0
  # Controls the precision of the action expert.
  enable_fp32: true
  enable_resume: true