Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
cold-load too slow on Jetson CPU)
Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
  adaptive baseline, burst-audio capture for post-hoc verify
  (threshold logic sketched after this list).
- Voice/marcus_voice.py — orchestrator with 3 modes
  (wake_and_command / always_on / always_on_gated), hysteretic VAD,
  pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
  80 Hz HPF, 0.97 pre-emphasis, peak-normalize; sketched after this
  list), faster-whisper base.en int8 with beam=8 + temperature
  fallback [0, 0.2, 0.4], fuzzy-match canonicalisation,
  GARBAGE_PATTERNS + length filter, /s-/ phonetic wake-verify,
  full-turn debug WAV recording.
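
A minimal sketch of the wake detector's adaptive trigger (class and
method names here are illustrative, not copied from
Voice/wake_detector.py; the real state machine adds word-duration
gating, cooldown, and burst capture):

    import numpy as np
    from collections import deque

    class EnergyWakeDetector:
        """Trigger when chunk RMS exceeds max(floor, ambient * mult)."""
        def __init__(self, floor=400.0, mult=3.0, window_n=50):
            self.floor = floor                     # stt.speech_threshold
            self.mult = mult                       # stt.wake_adaptive_mult
            self.ambient = deque(maxlen=window_n)  # stt.wake_adaptive_window_n
            self.speaking = False

        def feed(self, chunk):
            """chunk: int16 mono PCM, ~50 ms (stt.wake_chunk_ms)."""
            rms = float(np.sqrt(np.mean(chunk.astype(np.float64) ** 2)))
            baseline = float(np.mean(self.ambient)) if self.ambient else 0.0
            trigger = max(self.floor, baseline * self.mult)
            if rms >= trigger:
                fired = not self.speaking
                self.speaking = True
                return fired  # real detector also gates on 0.25-2.5 s duration
            self.speaking = False
            self.ambient.append(rms)  # learn noise floor only during silence
            return False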
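
And the DSP stage, sketched with numpy/scipy under the constants
listed above (the 2nd-order Butterworth is an assumption; only the
DC / 80 Hz / 0.97 / normalize chain is from the code):

    import numpy as np
    from scipy.signal import butter, lfilter

    def preprocess(pcm, sr=16000):
        """DC remove -> 80 Hz HPF -> 0.97 pre-emphasis -> peak-normalize."""
        x = pcm.astype(np.float32)
        x -= x.mean()                                    # DC remove
        b, a = butter(2, 80.0 / (sr / 2), btype="high")  # 80 Hz high-pass
        x = lfilter(b, a, x)
        x = np.append(x[0], x[1:] - 0.97 * x[:-1])       # pre-emphasis
        peak = float(np.max(np.abs(x)))
        if peak > 0:
            x *= 32767.0 / peak                          # peak-normalize
        return x.astype(np.int16)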
Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
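
The fuzzy-match canonicalisation against command_vocab is
expressible with stdlib difflib; a sketch (function name is
illustrative):

    import difflib
    from typing import List, Optional

    def canonicalise(text, vocab, cutoff=0.72):  # stt.command_vocab_cutoff
        """Map a raw transcription to the closest canonical phrase, or None."""
        cleaned = text.lower().strip(" .,!?")
        hits = difflib.get_close_matches(cleaned, vocab, n=1, cutoff=cutoff)
        return hits[0] if hits else None

    # e.g. canonicalise("Turn right up.", command_vocab) -> "turn right"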
Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
(move_map + step_duration_sec) via API/zmq_api.py; no more
hardcoded 0.3 / 2.0 magic numbers.
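
Roughly what the two new patterns look like (the regex bodies below
are a sketch, not a copy of Brain/command_parser.py):

    import re

    # bare direction, or verb + direction: 'left', 'go back', 'step right'
    _RE_SIMPLE_DIR = re.compile(
        r"^(?:(?:go|move|walk|step|turn)\s+)?"
        r"(forward|backward|back|left|right|around)$", re.IGNORECASE)

    # bare stop words
    _RE_STOP_SIMPLE = re.compile(
        r"^(stop|halt|wait|pause|freeze|hold)$", re.IGNORECASE)

    assert _RE_SIMPLE_DIR.match("step right").group(1) == "right"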
API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).
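
The resampling half of that change, sketched as a standalone helper
(linear interpolation; the actual AudioClient.PlayStream invocation
is unchanged SDK code):

    import numpy as np

    def resample_to_16k(pcm_bytes, src_rate):
        """Resample s16le mono PCM to the 16 kHz PlayStream expects."""
        if src_rate == 16000:
            return pcm_bytes
        x = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32)
        n_out = int(round(len(x) * 16000.0 / src_rate))
        t = np.linspace(0.0, len(x) - 1.0, n_out)
        y = np.interp(t, np.arange(len(x)), x)
        return y.astype(np.int16).tobytes()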
Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json
Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, and the Doc/environment.md changelog.
Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Config/config_Voice.json (152 lines, 10 KiB):
{
  "tts": {
    "backend": "builtin_ttsmaker",
    "builtin_speaker_id": 2,
    "target_sample_rate": 16000
  },
  "stt": {
    "backend": "faster_whisper",
    "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",

    "_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).",
    "mode": "wake_and_command",

    "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
    "always_on_speech_entry_rms": 150.0,
    "always_on_silence_exit_rms": 70.0,
    "always_on_silence_duration_sec": 0.8,
    "always_on_min_utterance_sec": 0.3,
    "always_on_max_utterance_sec": 12.0,
    "always_on_idle_log_sec": 5.0,
    "always_on_ambient_mult": 1.4,
    "always_on_ambient_window_chunks": 100,

    "whisper_model": "base.en",
    "whisper_device": "cpu",
    "whisper_compute_type": "int8",

    "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
    "mic_gain": 1.0,
    "whisper_beam_size": 8,
    "whisper_no_speech_threshold": 0.85,
    "whisper_log_prob_threshold": -1.8,
    "whisper_compression_ratio_threshold": 3.0,
    "whisper_temperature_fallback": [0.0, 0.2, 0.4],
    "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
    "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
    "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.",
    "whisper_initial_prompt": "Robot voice command.",

    "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
    "wake_words": [
      "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
      "sanat", "sunnat", "sonnat", "sinnat", "sennat",
      "sanid", "sanud", "saned", "sanod", "sanaad",
      "senad", "sinad", "sonad", "sunad",
      "sanah", "sanath", "sanadh", "sonadh",
      "samad", "somad", "sumad",
      "thanad", "zanad",
      "sa nad", "san ad", "san odd", "san add"
    ],
    "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
    "command_vocab": [
      "what do you see", "what can you see", "look around",
      "come to me", "come here", "come back", "come closer",
      "approach", "get closer", "come",
      "go home", "go back", "go forward", "go backward",
      "go left", "go right", "go",
      "sit down", "stand up", "sit", "stand",
      "raise arm", "lower arm", "wave hello", "wave", "point",
      "turn left", "turn right", "turn around",
      "move forward", "move backward", "move back",
      "move left", "move right",
      "walk forward", "walk backward", "walk back",
      "step forward", "step back", "step left", "step right",
      "forward", "backward", "back", "left", "right",
      "patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
      "hello", "hi", "hey", "help",
      "who are you", "where are you", "where am i", "what is your name",
      "remember this", "forget", "do it again", "repeat", "undo",
      "follow me", "stay here"
    ],
    "command_vocab_cutoff": 0.72,
    "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
    "garbage_patterns": [
      "thanks for watching", "thank you for watching",
      "thank you", "thanks",
      "bye", "goodbye",
      ".", "you", "yeah",
      "okay", "ok",
      "um", "uh", "hmm", "mm",
      "i", "a"
    ],
    "min_transcription_length": 3,

    "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
    "speech_threshold": 400.0,
    "min_word_duration": 0.25,
    "max_word_duration": 2.50,
    "post_silence": 0.20,
    "wake_cooldown": 1.00,
    "wake_chunk_ms": 50,
    "wake_adaptive_window_n": 50,
    "wake_adaptive_mult": 3.0,
    "wake_diag_log_sec": 3.0,

    "wake_ack": "tts",
    "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",

    "_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.",
    "wake_verify_enabled": false,

    "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
    "speech_entry_rms": 400.0,
    "silence_exit_rms": 200.0,
    "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective-entry multiplier over measured ambient (command entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.",
    "silence_duration_sec": 0.6,
    "max_record_sec": 5.0,
    "min_record_sec": 0.4,
    "ambient_probe_sec": 0.2,
    "ambient_mult": 1.5,
    "ambient_cap_rms": 200.0,
    "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
    "recording_enabled": true,
    "recording_keep_count": 50,
    "command_cooldown_sec": 1.5,
    "post_tts_settle_sec": 0.4,
    "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
  },
  "mic": {
    "backend": "builtin_udp",
    "source_index": "3",
    "format": "s16le",
    "rate": 16000,
    "channels": 1
  },
  "mic_udp": {
    "group": "239.168.123.161",
    "port": 5555,
    "buffer_max_bytes": 64000,
    "read_timeout_sec": 0.04
  },
  "speaker": {
    "dds_interface": "eth0",
    "volume": 100,
    "app_name": "sanad"
  },
  "audio": {
    "data_dir": "Data/Voice/Recordings",
    "log_file": "logs/voice.log"
  },
  "messages": {
    "wake_heard": "Yes",
    "no_speech": "I didn't catch that, please say it again",
    "error_tts": "Speech synthesis failed",
    "error_mic": "Microphone error",
    "ready": "Voice system ready"
  }
}
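
For reference, how the whisper_* keys above map onto a faster-whisper
call; a sketch assuming the stock faster_whisper API (the orchestrator
reads these values from config rather than hardcoding them, and the
WAV path is illustrative):

    from faster_whisper import WhisperModel

    model = WhisperModel("base.en", device="cpu", compute_type="int8")

    segments, info = model.transcribe(
        "Data/Voice/Recordings/cmd_example.wav",  # illustrative path
        beam_size=8,                            # whisper_beam_size
        temperature=[0.0, 0.2, 0.4],            # whisper_temperature_fallback
        no_speech_threshold=0.85,               # whisper_no_speech_threshold
        log_prob_threshold=-1.8,                # whisper_log_prob_threshold
        compression_ratio_threshold=3.0,        # whisper_compression_ratio_threshold
        initial_prompt="Robot voice command.",  # whisper_initial_prompt
        condition_on_previous_text=False,       # avoid prompt/context carry-over
    )
    text = " ".join(seg.text.strip() for seg in segments)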