# Model checkpoint and tokenizer used for this run.
# NOTE(review): both values look like "<org>/<name>" hub-style identifiers;
# confirm with the loader whether local directories are accepted as well.
model:
  model_path: robbyant/lingbot-vla-4b
  tokenizer_path: Qwen/Qwen2.5-VL-3B-Instruct

# Dataset location, input features, and normalization settings.
data:
  datasets_type: vla
  # NOTE(review): `robot_config_filename` and `path_to_dataset` read like
  # placeholders rather than real values; presumably they must be replaced
  # before launching a run (verify against the data loader).
  data_name: robot_config_filename
  train_path: path_to_dataset
  # Each entry is a single-key mapping from a feature name to an integer
  # (presumably that feature's dimension; TODO confirm against the loader).
  joints:
    - arm.position: 14
    - effector.position: 2
  # Camera stream names (presumably must match keys in the dataset; confirm).
  cameras:
    - camera_top
    - camera_wrist_left
    - camera_wrist_right
  num_workers: 8
  norm_type: meanstd
  # NOTE(review): `norm_path` looks like a placeholder for the statistics file
  # path; confirm and replace before use.
  norm_stats_file: norm_path

# Training-loop, parallelism, and checkpointing settings.
train:
  output_dir: "output/"
  data_parallel_mode: fsdp2
  enable_full_shard: false
  module_fsdp_enable: true
  use_compile: true
  rmpad: false
  rmpad_with_pos_ids: false
  ulysses_parallel_size: 1
  freeze_vision_encoder: false
  tokenizer_max_length: 72
  # NOTE(review): both limits are 75 while the `data.joints` entries total 16;
  # presumably smaller vectors are padded up to these sizes (TODO confirm).
  max_action_dim: 75
  max_state_dim: 75
  lr: 5.0e-5
  lr_decay_style: constant
  micro_batch_size: 32
  # global_batch_size =
  #   micro_batch_size * gradient_accumulation_steps * number_of_GPUs
  # With the values here and 8 GPUs: 32 * 1 * 8 = 256.
  gradient_accumulation_steps: 1
  max_steps: 40000
  ckpt_manager: dcp
  # With max_steps = 40000 and save_steps = 10000, step-based saving would
  # trigger 4 times (assuming a save every `save_steps` steps; confirm).
  save_steps: 10000
  # NOTE(review): 0 presumably disables epoch-based saving; confirm.
  save_epochs: 0
  enable_fp32: true   # Control the precision of the action expert
  enable_resume: true