# Training configuration with three top-level sections: model, data, train.
# NOTE(review): the nesting below was reconstructed from key order because the
# original indentation was lost — confirm against the consumer's config schema.

model:
  model_path: robbyant/lingbot-vla-4b-depth
  tokenizer_path: Qwen/Qwen2.5-VL-3B-Instruct
  moge_path: morgbd/moge2-vitb-normal.pt
  morgbd_path: lingbot_depth/model_mdm_pre.pt

data:
  datasets_type: vla
  # Placeholder values: replace with the real robot config name and dataset path.
  data_name: robot_config_filename
  train_path: path_to_dataset
  # Each entry maps a joint group to an integer
  # (presumably that group's dimension — TODO confirm).
  joints:
    - arm.position: 14
    - effector.position: 2
  cameras:
    - camera_top
    - camera_wrist_left
    - camera_wrist_right
  num_workers: 8
  norm_type: meanstd
  # Placeholder value: replace with the path to the normalization statistics.
  norm_stats_file: norm_path

train:
  output_dir: "output/"
  data_parallel_mode: fsdp2
  enable_full_shard: false
  module_fsdp_enable: true
  use_compile: true
  rmpad: false
  rmpad_with_pos_ids: false
  ulysses_parallel_size: 1
  freeze_vision_encoder: false
  tokenizer_max_length: 72
  max_action_dim: 75
  max_state_dim: 75
  lr: 5.0e-5
  lr_decay_style: constant
  micro_batch_size: 32
  gradient_accumulation_steps: 1
  # global_batch_size = micro_batch_size * gradient_accumulation_steps * 8 = 256 when we train with 8 GPUs
  max_steps: 40000
  ckpt_manager: dcp
  save_steps: 10000
  save_epochs: 0
  enable_fp32: true  # Control the precision of the action expert
  enable_resume: true
  # NOTE(review): align_params is assumed to belong under `train`, with `llm`
  # and `depth` as its children — verify against the training code.
  align_params:
    mode: 'query'
    num_task_tokens: 8
    use_image_tokens: true
    use_task_tokens: false
    use_text_tokens: false
    use_contrastive: true
    contrastive_loss_weight: 0.3
    depth_loss_weight: 0.004
    llm:
      dim_out: 2048
      image_token_size: 8
      image_input_size: 224
    depth:
      model_type: MoRGBD
      num_layers: 1
      num_heads: 4
      dim_head: 32
      ff_mult: 1
      num_backbone_tokens: 256
      token_size: 16
      dim_out: 1024
      input_size: 224