# =============================================================================
# Production wake-word config — VoxCPM2 TTS (voice design, no reference audio)
# =============================================================================
#
# Install:  uv sync --extra train --extra voxcpm
# Prefetch: livekit-wakeword setup --config configs/prod_voxcpm.yaml
# Pipeline: livekit-wakeword run configs/prod_voxcpm.yaml
#
# Multilingual wake words require tts_backend: voxcpm (Piper is English–US
# only). This example uses Chinese 你好 livekit ("nihao livekit"). Write
# target_phrases and custom_negative_phrases in your language; localize
# voice_design_prompts if you like. Auto adversarial negatives are
# English/CMUdict-biased — add manual negatives for non-English.
#
# Schema: WakeWordConfig in src/livekit/wakeword/config.py
# =============================================================================

# -----------------------------------------------------------------------------
# model_name (required)
# -----------------------------------------------------------------------------
# Used for output/<model_name>/, checkpoint basename, and exported ONNX filename.
model_name: nihao_livekit_voxcpm

# -----------------------------------------------------------------------------
# target_phrases (required)
# -----------------------------------------------------------------------------
# Exact phrases to synthesize as positives and to defend against via adversarial
# negatives. Include spelling variants users might say.
target_phrases:
  - "你好 livekit"  # Chinese wake phrase; pinyin: nihao livekit

# =============================================================================
# Data generation (top-level)
# =============================================================================

# tts_backend — Synthetic speech engine for clip generation.
#   piper_vits — VITS + SLERP (904 speakers); uses piper_tts + noise_* /
#     slerp_* below.
#   voxcpm — VoxCPM2 voice-design TTS; uses voxcpm_tts below. Prefer this
#     backend for multilingual wake words (phrases in target language; no
#     extra language field).
tts_backend: voxcpm

# n_samples — Training clips per synthetic class (positive + adversarial negative).
# Each class gets this many clips before augmentation rounds multiply variants.
n_samples: 25000

# n_samples_val — Validation clips per synthetic class (~10–20% of n_samples).
n_samples_val: 5000

# n_background_samples / n_background_samples_val — Count of tiled
# background-noise clips.
n_background_samples: 2000
n_background_samples_val: 500

# tts_batch_size — Piper: GPU batch size for VITS. VoxCPM: ignored (sequential
# synthesis).
tts_batch_size: 50

# custom_negative_phrases — Extra adversarial phrases (in addition to auto
# phoneme edits). Tuned for Chinese 你好 livekit; add English false friends if
# users might mix languages.
custom_negative_phrases:
  - "livekit"
  - "你好"
  - "您好 livekit"
  - "嘿 livekit"
  - "嗨 livekit"
  - "你好吗 livekit"
  - "live kit"
  - "hey libby"
  - "hey liquid"

# -----------------------------------------------------------------------------
# Piper-only TTS controls (ignored when tts_backend is voxcpm; kept for schema
# parity)
# -----------------------------------------------------------------------------
# noise_scales — VITS decoder noise (overall variability).
# noise_scale_ws — VITS duration noise.
# length_scales — Speaking-rate multipliers.
# slerp_weights — Speaker-blend weights for SLERP pairs.
# max_speakers — Cap Piper speaker IDs (null = all 904).
noise_scales: [0.98]
noise_scale_ws: [0.98]
length_scales: [0.75, 1.0, 1.25]
slerp_weights: [0.2, 0.35, 0.5, 0.65, 0.8]
# max_speakers: null

# -----------------------------------------------------------------------------
# piper_tts — Piper checkpoint layout (only used when tts_backend is piper_vits)
# -----------------------------------------------------------------------------
# checkpoint_relpath — Path to en-us-libritts-high.pt relative to data_dir;
# JSON beside it.
piper_tts:
  checkpoint_relpath: piper/en-us-libritts-high.pt

# -----------------------------------------------------------------------------
# voxcpm_tts — VoxCPM2 settings (only used when tts_backend is voxcpm)
# -----------------------------------------------------------------------------
# model_id — Hugging Face repo id for `setup` snapshot_download.
# model_cache_relpath — Directory under data_dir for the snapshot (default
#   voxcpm/VoxCPM2).
# local_model_path — Optional: absolute path or path relative to data_dir with
#   weights. If set and directory is non-empty, setup skips download.
# load_denoiser — Pass-through to VoxCPM.from_pretrained (heavier load if true).
# voice_design_prompts — Text personas; clip index cycles (prompt × cfg ×
#   timesteps).
# cfg_values — Classifier-free guidance scales per clip (higher = stronger
#   conditioning).
# inference_timesteps_list — Diffusion denoising steps at inference (more =
#   slower, often cleaner).
voxcpm_tts:
  model_id: openbmb/VoxCPM2
  model_cache_relpath: voxcpm/VoxCPM2
  local_model_path: null
  load_denoiser: false
  cfg_values: [1.5, 2.0, 2.5, 3.0]
  inference_timesteps_list: [8, 10, 12]
  voice_design_prompts:
    - "A young adult woman, clear mid-pitch voice, moderate pace, calm and professional"
    - "A young adult man, warm baritone, steady pace, friendly and articulate"
    - "A middle-aged woman, slightly low pitch, measured pace, confident tone"
    - "A middle-aged man, deep resonant voice, slow deliberate pace"
    - "An older adult woman, soft gentle voice, slower pace, kind tone"
    - "An older adult man, gravelly tenor, moderate pace, matter-of-fact"
    - "A young woman, bright energetic tone, slightly faster pace"
    - "A young man, light tenor, quick conversational pace"
    - "A woman in her thirties, smooth alto, neutral American accent, even pace"
    - "A man in his thirties, clear baritone, businesslike pace"
    - "A speaker with a higher pitch, enthusiastic and upbeat, medium-fast pace"
    - "A speaker with a lower pitch, relaxed and laid-back, slower pace"
    - "A young adult, gender-neutral delivery, soft volume, careful enunciation"
    - "A confident presenter voice, strong projection, moderate speed"
    - "A quiet intimate voice, close-mic feel, slow and clear"
    - "A news-anchor style voice, authoritative, even rhythm"
    - "A friendly customer-service tone, slightly smiling, medium pace"
    - "A tired but clear voice, subdued energy, steady pace"
    - "A cheerful animated voice, wide pitch range, lively pace"
    - "A serious formal voice, minimal emotion, precise articulation"
    - "A Southern US English accent, warm tone, conversational pace"
    - "A British English accent, clear RP-like delivery, moderate pace"
    - "A speaker with slight vocal fry, casual young adult, medium pace"
    - "A very smooth polished voice, studio quality, slow to medium pace"
    - "A nasal bright tone, energetic, faster than average pace"
    - "A breathy soft voice, gentle, slower pace"
    - "A robust athletic-sounding voice, strong and direct, medium pace"
    - "A scholarly tone, thoughtful pauses, slower academic pace"
    - "A teenager-sounding voice, casual, slightly fast, light pitch"
    - "A mature executive voice, controlled low dynamics, steady"
    - "A sing-song playful tone, varied pitch, medium pace"
    - "A monotone flat delivery, robotic clarity, even speed"
    - "A husky voice, low energy, medium-slow pace"
    - "A crisp precise voice, minimal accent, fast clear speech"

# =============================================================================
# Paths
# =============================================================================

# data_dir — Root for downloads (setup), backgrounds, RIRs, VoxCPM snapshot,
# Piper if used.
data_dir: ./data

# output_dir — Root for generated clips, features, checkpoints, exports (per
# model_name).
output_dir: ./output

# =============================================================================
# augmentation (AugmentationConfig)
# =============================================================================
# clip_duration — Seconds; fixed window for embedding pipeline (default 2.0
#   matches ONNX head).
# batch_size — Augmentation batch size.
# rounds — Augmentation passes per sample (more = stronger regularization,
#   slower).
# background_paths — Dirs of raw noise (MUSAN subset from setup).
# rir_paths — Room impulse response dirs (from setup).
augmentation:
  clip_duration: 2.0
  batch_size: 16
  rounds: 3
  background_paths: [./data/backgrounds]
  rir_paths: [./data/rirs]

# =============================================================================
# model (ModelConfig)
# =============================================================================
# model_type — dnn | rnn | conv_attention
# model_size — tiny | small | medium | large (layer width × depth presets)
model:
  model_type: conv_attention
  model_size: medium

# =============================================================================
# Training
# =============================================================================
# steps — Phase-1 step budget (trainer uses multi-phase schedule).
# learning_rate — AdamW base LR.
# weight_decay — AdamW L2-style decay.
# label_smoothing — BCE target smoothing (0–0.1 typical).
# max_negative_weight — Upper cap for negative-class loss weight ramp.
# target_fp_per_hour — Validation FPPH target for adaptive weighting /
#   thresholding.
# batch_n_per_class — Samples per class per training step (keys must match
#   trainer).
steps: 100000
learning_rate: 0.0001
weight_decay: 0.01
label_smoothing: 0.05
max_negative_weight: 3000
target_fp_per_hour: 0.1
batch_n_per_class:
  positive: 50
  adversarial_negative: 50
  ACAV100M_sample: 1024
  background_noise: 50