# Model name — used for output directory and exported model filename
#
# Multilingual wake words: you must use tts_backend: voxcpm — Piper cannot synthesize
# arbitrary languages. See configs/prod_voxcpm.yaml (Chinese example: 你好 livekit / nihao livekit).
model_name: hey_livekit

# Wake word phrases to detect (include spelling variations)
target_phrases: ["hey livekit"]

# ============================================================================
# Data Generation
# ============================================================================

# Training samples per class (positive + negative)
# Suggestion: 5000-10000 for quick experiments, 20000+ for production
n_samples: 25000

# Validation samples per class
# Suggestion: 10-20% of n_samples
n_samples_val: 5000

n_background_samples: 2000
n_background_samples_val: 500

# VITS TTS batch size (higher = faster, but more VRAM)
# Suggestion: 50 for 8GB GPU, 100+ for 16GB+
tts_batch_size: 50

# Adversarial negatives — phonetically similar phrases that should NOT trigger
# Suggestion: add words that sound like your wake word and common misheard phrases
custom_negative_phrases:
  # Bare wake word without prefix — the most common false trigger
  - "livekit"
  - "live kit"
  # Phonetically similar prefixes
  - "hey libby"
  - "hey livid"
  - "hey lidocaine"
  - "hey liquid"
  - "hey linux"
  - "hey lyric"
  # Partial/adjacent phrases
  - "play live"
  - "they live"
  - "hey look at"
  - "hey look it"

# ============================================================================
# TTS Parameters (VITS + SLERP speaker blending)
# ============================================================================

# Overall speech variability
noise_scales: [0.98]

# Phoneme duration variability
noise_scale_ws: [0.98]

# Speaking rate multipliers (slow / normal / fast)
length_scales: [0.75, 1.0, 1.25]

# SLERP interpolation weights for speaker blending (0 = speaker 1, 1 = speaker 2)
# Multiple weights create more diverse synthetic voices — values near 0/1
# sound closer to the original speakers; 0.5 is the midpoint blend
slerp_weights: [0.2, 0.35, 0.5, 0.65, 0.8]

# Cap on speaker IDs (null = all 904 speakers)
# max_speakers: null

# ============================================================================
# Paths
# ============================================================================

# Root data directory (models, backgrounds, RIRs downloaded here via `setup`)
data_dir: ./data

# Output directory for generated audio, features, checkpoints, and exported model
output_dir: ./output

# ============================================================================
# Augmentation
# ============================================================================
augmentation:
  # Target clip duration in seconds after augmentation
  # Controls the fixed-length window that all clips are padded/cropped to.
  # Default 2.0s is tuned for the built-in embedding model (16 timesteps).
  # Set a different value when generating data for external models that
  # expect a different input length.
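  # Worked example (assumption: the timestep rate stays fixed, which this
  # config does not state explicitly): 2.0 s / 16 timesteps = 0.125 s per
  # timestep, so an external model expecting 24 timesteps would suggest
  # clip_duration: 3.0 (24 * 0.125).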
  clip_duration: 2.0

  # Batch size for augmentation processing
  batch_size: 16

  # Number of augmentation passes per sample
  # Suggestion: 1 for quick training, 2-3 for better generalization
  rounds: 3

  # Background noise directories (downloaded via `setup`)
  background_paths: [./data/backgrounds]

  # Room impulse response directories for reverb (downloaded via `setup`)
  rir_paths: [./data/rirs]

# ============================================================================
# Model Architecture
# ============================================================================
model:
  # Classifier type:
  #   "dnn"            — fast, flattens temporal structure
  #   "rnn"            — Bi-LSTM, better for variable-length phrases
  #   "conv_attention" — 1D conv + self-attention, best temporal awareness
  model_type: conv_attention

  # Model size: tiny (16d), small (32d), medium (128d), large (256d)
  # Suggestion: small for embedded, medium for server, large for max accuracy
  model_size: medium

# ============================================================================
# Training
# ============================================================================

# Total training steps (phase 1)
# Suggestion: 30000 for quick experiments, 50000-100000 for production
steps: 100000

# Base learning rate
# Suggestion: 0.0001 works well for most cases
learning_rate: 0.0001

# AdamW weight decay for L2 regularization
# Suggestion: 0.01 for most cases, 0.001 for tiny models
weight_decay: 0.01

# Label smoothing — softens targets from hard 0/1 to ε/1-ε
# Prevents overconfident predictions and improves threshold calibration
# Suggestion: 0.05 for production, 0.0 to disable
label_smoothing: 0.05

# Max weight for negative class (linearly increases during training)
# Higher = fewer false positives, but may reduce recall
# Suggestion: 1000-2000 for balanced, 3000+ for very low false positives
max_negative_weight: 3000

# Target false positives per hour on validation set
# Used for adaptive negative weight doubling and threshold optimization
# Suggestion: 0.1-0.5 for production, 1.0+ for prototyping
target_fp_per_hour: 0.1

# Batch composition per training step
batch_n_per_class:
  positive: 50              # wake word samples
  adversarial_negative: 50  # phonetically similar non-wake-words
  ACAV100M_sample: 1024     # random speech from ACAV100M dataset
  background_noise: 50      # pure ambient noise (HVAC, music, etc.)
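
# For reference: the per-class counts above sum to 50 + 50 + 1024 + 50 = 1174
# samples per training step, of which only 50/1174 ≈ 4.3% are positives.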