{
  "tts": {
    "backend": "builtin_ttsmaker",
    "builtin_speaker_id": 2,
    "target_sample_rate": 16000
  },
  "stt": {
    "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
    "backend": "moonshine",
    "moonshine_language": "en",
    "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",

    "_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
    "mode": "wake_and_command",
    "await_command_timeout_sec": 10.0,

    "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
    "always_on_speech_entry_rms": 150.0,
    "always_on_silence_exit_rms": 70.0,
    "always_on_silence_duration_sec": 0.8,
    "always_on_min_utterance_sec": 0.3,
    "always_on_max_utterance_sec": 12.0,
    "always_on_idle_log_sec": 5.0,
    "always_on_ambient_mult": 1.4,
    "always_on_ambient_window_chunks": 100,

    "whisper_model": "base.en",
    "whisper_device": "cpu",
    "whisper_compute_type": "int8",

    "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
    "mic_gain": 1.0,
    "whisper_beam_size": 8,
    "whisper_no_speech_threshold": 0.85,
    "whisper_log_prob_threshold": -1.8,
    "whisper_compression_ratio_threshold": 3.0,
    "whisper_temperature_fallback": [0.0, 0.2, 0.4],
    "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
    "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
    "_initial_prompt_comment": "Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). A clean, unbiased decode (empty prompt) is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. CURRENTLY SET to the command vocabulary below for nudging; set whisper_initial_prompt to \"\" if you see prompt phrases echoed back as phantom commands.",
    "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",

    "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
    "wake_words": [
      "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
      "sanat", "sunnat", "sonnat", "sinnat", "sennat",
      "sanid", "sanud", "saned", "sanod", "sanaad",
      "senad", "sinad", "sonad", "sunad",
      "sanah", "sanath", "sanadh", "sonadh",
      "samad", "somad", "sumad",
      "thanad", "zanad",
      "sa nad", "san ad", "san odd", "san add"
    ],
    "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
    "command_vocab": [
      "what do you see", "what can you see", "look around",
      "come to me", "come here", "come back", "come closer",
      "approach", "get closer", "come",
      "go home", "go back", "go forward", "go backward",
      "go left", "go right", "go",
      "sit down", "stand up", "sit", "stand",
      "raise arm", "lower arm", "wave hello", "wave", "point",
      "turn left", "turn right", "turn around",
      "move forward", "move backward", "move back",
      "move left", "move right",
      "walk forward", "walk backward", "walk back",
      "step forward", "step back", "step left", "step right",
      "forward", "backward", "back", "left", "right",
      "patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
      "hello", "hi", "hey", "help",
      "who are you", "where are you", "where am i", "what is your name",
      "remember this", "forget", "do it again", "repeat", "undo",
      "follow me", "stay here"
    ],
    "command_vocab_cutoff": 0.72,
    "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
    "garbage_patterns": [
      "thanks for watching", "thank you for watching",
      "thank you", "thanks",
      "bye", "goodbye",
      ".", "you", "yeah",
      "okay", "ok",
      "um", "uh", "hmm", "mm",
      "i", "a"
    ],
    "min_transcription_length": 3,

    "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin. NOTE(review): current speech_threshold is 200.0, BELOW the 250-350 ambient described here — per the CRITICAL rule above this would wedge the detector in SPEAKING; confirm this room is quieter than measured, or raise toward 400.",
    "speech_threshold": 200.0,
    "min_word_duration": 0.25,
    "max_word_duration": 2.50,
    "post_silence": 0.20,
    "wake_cooldown": 1.00,
    "wake_chunk_ms": 50,
    "wake_adaptive_window_n": 50,
    "wake_adaptive_mult": 2.0,
    "wake_diag_log_sec": 3.0,

    "wake_ack": "tts",
    "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",

    "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
    "wake_verify_enabled": true,

    "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
    "speech_entry_rms": 400.0,
    "silence_exit_rms": 200.0,
    "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.",
    "silence_duration_sec": 0.6,
    "max_record_sec": 5.0,
    "min_record_sec": 0.4,
    "ambient_probe_sec": 0.2,
    "ambient_mult": 1.5,
    "ambient_cap_rms": 200.0,
    "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
    "recording_enabled": true,
    "recording_keep_count": 50,
    "command_cooldown_sec": 1.5,
    "post_tts_settle_sec": 0.4,
    "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
  },
  "mic": {
    "backend": "builtin_udp",
    "source_index": "3",
    "format": "s16le",
    "rate": 16000,
    "channels": 1
  },
  "mic_udp": {
    "group": "239.168.123.161",
    "port": 5555,
    "buffer_max_bytes": 64000,
    "read_timeout_sec": 0.04
  },
  "speaker": {
    "dds_interface": "eth0",
    "volume": 100,
    "app_name": "sanad"
  },
  "audio": {
    "data_dir": "Data/Voice/Recordings",
    "log_file": "logs/voice.log"
  },
  "messages": {
    "wake_heard": "Yes",
    "no_speech": "I didn't catch that, please say it again",
    "error_tts": "Speech synthesis failed",
    "error_mic": "Microphone error",
    "ready": "Voice system ready"
  }
}