# Model name — used for output directory and exported model filename
#
# Multilingual wake words: you must use tts_backend: voxcpm — Piper cannot synthesize
# arbitrary languages. See configs/prod_voxcpm.yaml (Chinese example: 你好 livekit / nihao livekit).
model_name: hey_livekit

# Wake word phrases to detect (include spelling variations)
target_phrases: ["hey livekit"]

# ============================================================================
# Data Generation
# ============================================================================

# Training samples per class (positive + negative)
# Suggestion: 5000-10000 for quick experiments, 20000+ for production
n_samples: 25000

# Validation samples per class
# Suggestion: 10-20% of n_samples
n_samples_val: 5000

n_background_samples: 2000
n_background_samples_val: 500

# VITS TTS batch size (higher = faster, but more VRAM)
# Suggestion: 50 for 8GB GPU, 100+ for 16GB+
tts_batch_size: 50

# Adversarial negatives — phonetically similar phrases that should NOT trigger
# Suggestion: add words that sound like your wake word and common misheard phrases
custom_negative_phrases:
  # Bare wake word without prefix — the most common false trigger
  - "livekit"
  - "live kit"
  # Phonetically similar prefixes
  - "hey libby"
  - "hey livid"
  - "hey lidocaine"
  - "hey liquid"
  - "hey linux"
  - "hey lyric"
  # Partial/adjacent phrases
  - "play live"
  - "they live"
  - "hey look at"
  - "hey look it"

# ============================================================================
# TTS Parameters (VITS + SLERP speaker blending)
# ============================================================================

# Overall speech variability
noise_scales: [0.98]

# Phoneme duration variability
noise_scale_ws: [0.98]

# Speaking rate multipliers (slow / normal / fast)
length_scales: [0.75, 1.0, 1.25]

# SLERP interpolation weights for speaker blending (0 = speaker 1, 1 = speaker 2)
# Multiple weights create more diverse synthetic voices — values near 0/1
# sound closer to the original speakers; 0.5 is the midpoint blend
slerp_weights: [0.2, 0.35, 0.5, 0.65, 0.8]

# Cap on speaker IDs (null = all 904 speakers)
# max_speakers: null

# ============================================================================
# Paths
# ============================================================================

# Root data directory (models, backgrounds, RIRs downloaded here via `setup`)
data_dir: ./data

# Output directory for generated audio, features, checkpoints, and exported model
output_dir: ./output

# ============================================================================
# Augmentation
# ============================================================================
augmentation:
  # Target clip duration in seconds after augmentation
  # Controls the fixed-length window that all clips are padded/cropped to.
  # Default 2.0s is tuned for the built-in embedding model (16 timesteps).
  # Set a different value when generating data for external models that
  # expect a different input length.
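  # Worked example (assumption: the timestep rate stays fixed, which this
  # config does not state explicitly): 2.0 s / 16 timesteps = 0.125 s per
  # timestep, so an external model expecting 24 timesteps would suggest
  # clip_duration: 3.0 (24 * 0.125).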
  clip_duration: 2.0

  # Batch size for augmentation processing
  batch_size: 16

  # Number of augmentation passes per sample
  # Suggestion: 1 for quick training, 2-3 for better generalization
  rounds: 3

  # Background noise directories (downloaded via `setup`)
  background_paths: [./data/backgrounds]

  # Room impulse response directories for reverb (downloaded via `setup`)
  rir_paths: [./data/rirs]

# ============================================================================
# Model Architecture
# ============================================================================
model:
  # Classifier type:
  #   "dnn"            — fast, flattens temporal structure
  #   "rnn"            — Bi-LSTM, better for variable-length phrases
  #   "conv_attention" — 1D conv + self-attention, best temporal awareness
  model_type: conv_attention

  # Model size: tiny (16d), small (32d), medium (128d), large (256d)
  # Suggestion: small for embedded, medium for server, large for max accuracy
  model_size: medium

# ============================================================================
# Training
# ============================================================================

# Total training steps (phase 1)
# Suggestion: 30000 for quick experiments, 50000-100000 for production
steps: 100000

# Base learning rate
# Suggestion: 0.0001 works well for most cases
learning_rate: 0.0001

# AdamW weight decay for L2 regularization
# Suggestion: 0.01 for most cases, 0.001 for tiny models
weight_decay: 0.01

# Label smoothing — softens targets from hard 0/1 to ε/1-ε
# Prevents overconfident predictions and improves threshold calibration
# Suggestion: 0.05 for production, 0.0 to disable
label_smoothing: 0.05

# Max weight for negative class (linearly increases during training)
# Higher = fewer false positives, but may reduce recall
# Suggestion: 1000-2000 for balanced, 3000+ for very low false positives
max_negative_weight: 3000

# Target false positives per hour on validation set
# Used for adaptive negative weight doubling and threshold optimization
# Suggestion: 0.1-0.5 for production, 1.0+ for prototyping
target_fp_per_hour: 0.1

# Batch composition per training step
batch_n_per_class:
  positive: 50              # wake word samples
  adversarial_negative: 50  # phonetically similar non-wake-words
  ACAV100M_sample: 1024     # random speech from ACAV100M dataset
  background_noise: 50      # pure ambient noise (HVAC, music, etc.)
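
# For reference: the per-class counts above sum to 50 + 50 + 1024 + 50 = 1174
# samples per training step, of which only 50/1174 ≈ 4.3% are positives.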