# Locations of the model weights, tokenizer, and auxiliary checkpoints.
# NOTE(review): model_path and tokenizer_path look like hub-style
# "<org>/<name>" identifiers, while the two *.pt entries look like relative
# file paths — confirm how the loader resolves each of them.
model:
  model_path: robbyant/lingbot-vla-4b-depth
  tokenizer_path: Qwen/Qwen2.5-VL-3B-Instruct
  # Two separate .pt checkpoint files, configured independently.
  moge_path: morgbd/moge2-vitb-normal.pt
  morgbd_path: lingbot_depth/model_mdm_pre.pt

# Dataset selection and preprocessing settings.
data:
  datasets_type: vla
  # NOTE(review): data_name, train_path and norm_stats_file (further down)
  # look like placeholder values meant to be replaced per setup — confirm
  # before launching a run.
  data_name: robot_config_filename
  train_path: path_to_dataset
  # Each list entry is a single-key mapping from a feature name to an integer
  # (presumably that feature's dimension count — TODO confirm with the loader).
  joints:
    - arm.position: 14
    - effector.position: 2
  # Camera names listed for this dataset.
  cameras:
    - camera_top
    - camera_wrist_left
    - camera_wrist_right
  num_workers: 8
  # Presumably mean/std normalization using the statistics file named below —
  # verify against the data pipeline.
  norm_type: meanstd
  norm_stats_file: norm_path

# Training run settings: output location, parallelism switches, optimizer
# schedule, checkpointing, and the nested alignment options.
train:
  output_dir: "output/"
  # Parallelism and execution switches; their exact semantics are defined by
  # the trainer that reads this file.
  data_parallel_mode: fsdp2
  enable_full_shard: false
  module_fsdp_enable: true
  use_compile: true
  rmpad: false
  rmpad_with_pos_ids: false
  ulysses_parallel_size: 1
  freeze_vision_encoder: false
  tokenizer_max_length: 72
  # NOTE(review): presumably upper bounds that action/state vectors are padded
  # to — confirm against the model code.
  max_action_dim: 75
  max_state_dim: 75
  lr: 5.0e-5
  lr_decay_style: constant
  micro_batch_size: 32
  # global_batch_size = micro_batch_size * gradient_accumulation_steps * num_gpus
  #                   = 32 * 1 * 8 = 256 when training with 8 GPUs.
  gradient_accumulation_steps: 1
  max_steps: 40000
  ckpt_manager: dcp
  save_steps: 10000
  save_epochs: 0
  # Controls the precision of the action expert.
  enable_fp32: true
  enable_resume: true
  # Alignment options: shared toggles and loss weights first, then one
  # sub-mapping per branch (llm, depth).
  align_params:
    mode: 'query'
    num_task_tokens: 8
    use_image_tokens: true
    use_task_tokens: false
    use_text_tokens: false
    use_contrastive: true
    contrastive_loss_weight: 0.3
    depth_loss_weight: 0.004
    llm:
      dim_out: 2048
      image_token_size: 8
      image_input_size: 224
    depth:
      model_type: MoRGBD
      num_layers: 1
      num_heads: 4
      dim_head: 32
      ff_mult: 1
      num_backbone_tokens: 256
      token_size: 16
      dim_out: 1024
      input_size: 224