---
# Minimal end-to-end config using VoxCPM2 (voice design) for TTS.
# Use with: livekit-wakeword setup --config configs/test_voxcpm.yaml
# SkyPilot: see skypilot/train.yaml (CONFIG_FILE=test_voxcpm.yaml).
#
# Multilingual wake words require tts_backend: voxcpm (the default Piper backend
# is English-US only). This file uses a Chinese wake phrase (你好 livekit,
# pinyin: "nihao livekit"). Put positives and manual negatives in your target
# script; auto-generated adversarial negatives are English/CMUdict-biased.

model_name: test_voxcpm

target_phrases:
  - "你好 livekit"  # Chinese; pinyin: nihao livekit

# ============================================================================
# Data Generation
# ============================================================================
tts_backend: voxcpm

# Small diversification grid for faster smoke runs (defaults are much larger).
# NOTE(review): indentation was lost in the source file; the nesting under
# voxcpm_tts below is reconstructed from key semantics — verify against the
# consuming tool's schema.
voxcpm_tts:
  voice_design_prompts:
    - "A young adult woman, clear mid-pitch voice, moderate pace, calm tone"
    - "A young adult man, warm baritone, steady pace, friendly tone"
  cfg_values: [2.0]
  inference_timesteps_list: [10]

# Sample counts kept small for a smoke run.
n_samples: 24
n_samples_val: 8
n_background_samples: 40
n_background_samples_val: 8
tts_batch_size: 4

# Manual hard negatives: partial matches and near-homophones of the wake phrase.
custom_negative_phrases:
  - "livekit"
  - "你好"
  - "嘿 livekit"
  - "hey libby"

# ============================================================================
# Paths
# ============================================================================
data_dir: ./data
output_dir: ./output

# ============================================================================
# Augmentation
# ============================================================================
augmentation:
  clip_duration: 2.0
  batch_size: 8
  rounds: 2
  background_paths: [./data/backgrounds]
  rir_paths: [./data/rirs]

# ============================================================================
# Model Architecture
# ============================================================================
model:
  model_type: dnn
  model_size: tiny

# ============================================================================
# Training
# ============================================================================
steps: 500
learning_rate: 0.001
max_negative_weight: 1000
target_fp_per_hour: 1.0

# Per-class batch composition (samples drawn per training step).
batch_n_per_class:
  positive: 10
  adversarial_negative: 10
  ACAV100M_sample: 64
  background_noise: 10