---
# Minimal end-to-end config using VoxCPM2 (voice design) for TTS.
# Use with: livekit-wakeword setup --config configs/test_voxcpm.yaml
# SkyPilot: see skypilot/train.yaml (CONFIG_FILE=test_voxcpm.yaml).
#
# Multilingual wake words require tts_backend: voxcpm (the default Piper backend
# is English-US only). This file uses a Chinese wake phrase (你好 livekit,
# pinyin: "nihao livekit"). Put positives and manual negatives in your target
# script; auto-generated adversarial negatives are English/CMUdict-biased.

model_name: test_voxcpm

target_phrases:
  - "你好 livekit"  # Chinese; pinyin: nihao livekit

# ============================================================================
# Data Generation
# ============================================================================
tts_backend: voxcpm

# Small diversification grid for faster smoke runs (defaults are much larger).
# NOTE(review): indentation was lost in the source file; the nesting under
# voxcpm_tts below is reconstructed from key semantics — verify against the
# consuming tool's schema.
voxcpm_tts:
  voice_design_prompts:
    - "A young adult woman, clear mid-pitch voice, moderate pace, calm tone"
    - "A young adult man, warm baritone, steady pace, friendly tone"
  cfg_values: [2.0]
  inference_timesteps_list: [10]

# Sample counts kept small for a smoke run.
n_samples: 24
n_samples_val: 8
n_background_samples: 40
n_background_samples_val: 8
tts_batch_size: 4

# Manual hard negatives: partial matches and near-homophones of the wake phrase.
custom_negative_phrases:
  - "livekit"
  - "你好"
  - "嘿 livekit"
  - "hey libby"

# ============================================================================
# Paths
# ============================================================================
data_dir: ./data
output_dir: ./output

# ============================================================================
# Augmentation
# ============================================================================
augmentation:
  clip_duration: 2.0
  batch_size: 8
  rounds: 2
  background_paths: [./data/backgrounds]
  rir_paths: [./data/rirs]

# ============================================================================
# Model Architecture
# ============================================================================
model:
  model_type: dnn
  model_size: tiny

# ============================================================================
# Training
# ============================================================================
steps: 500
learning_rate: 0.001
max_negative_weight: 1000
target_fp_per_hour: 1.0

# Per-class batch composition (samples drawn per training step).
batch_n_per_class:
  positive: 10
  adversarial_negative: 10
  ACAV100M_sample: 64
  background_noise: 10