{
  "tts": {
    "backend": "builtin_ttsmaker",
    "builtin_speaker_id": 2,
    "target_sample_rate": 16000
  },
  "stt": {
    "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
    "backend": "moonshine",
    "moonshine_language": "en",
    "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",

    "_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
    "mode": "wake_and_command",
    "await_command_timeout_sec": 10.0,

    "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
    "always_on_speech_entry_rms": 150.0,
    "always_on_silence_exit_rms": 70.0,
    "always_on_silence_duration_sec": 0.8,
    "always_on_min_utterance_sec": 0.3,
    "always_on_max_utterance_sec": 12.0,
    "always_on_idle_log_sec": 5.0,
    "always_on_ambient_mult": 1.4,
    "always_on_ambient_window_chunks": 100,

    "whisper_model": "base.en",
    "whisper_device": "cpu",
    "whisper_compute_type": "int8",

    "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
    "mic_gain": 1.0,
    "whisper_beam_size": 8,
    "whisper_no_speech_threshold": 0.85,
    "whisper_log_prob_threshold": -1.8,
    "whisper_compression_ratio_threshold": 3.0,
    "whisper_temperature_fallback": [0.0, 0.2, 0.4],
    "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
    "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
    "_initial_prompt_comment": "Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). A clean, unbiased decode (empty prompt) is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. CURRENTLY SET to the command vocabulary below for nudging; set whisper_initial_prompt to \"\" if you see prompt phrases echoed back as phantom commands.",
    "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",

    "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
    "wake_words": [
      "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
      "sanat", "sunnat", "sonnat", "sinnat", "sennat",
      "sanid", "sanud", "saned", "sanod", "sanaad",
      "senad", "sinad", "sonad", "sunad",
      "sanah", "sanath", "sanadh", "sonadh",
      "samad", "somad", "sumad",
      "thanad", "zanad",
      "sa nad", "san ad", "san odd", "san add"
    ],
    "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
    "command_vocab": [
      "what do you see", "what can you see", "look around",
      "come to me", "come here", "come back", "come closer",
      "approach", "get closer", "come",
      "go home", "go back", "go forward", "go backward",
      "go left", "go right", "go",
      "sit down", "stand up", "sit", "stand",
      "raise arm", "lower arm", "wave hello", "wave", "point",
      "turn left", "turn right", "turn around",
      "move forward", "move backward", "move back",
      "move left", "move right",
      "walk forward", "walk backward", "walk back",
      "step forward", "step back", "step left", "step right",
      "forward", "backward", "back", "left", "right",
      "patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
      "hello", "hi", "hey", "help",
      "who are you", "where are you", "where am i", "what is your name",
      "remember this", "forget", "do it again", "repeat", "undo",
      "follow me", "stay here"
    ],
    "command_vocab_cutoff": 0.72,
    "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
    "garbage_patterns": [
      "thanks for watching", "thank you for watching",
      "thank you", "thanks",
      "bye", "goodbye",
      ".", "you", "yeah",
      "okay", "ok",
      "um", "uh", "hmm", "mm",
      "i", "a"
    ],
    "min_transcription_length": 3,

    "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin. NOTE(review): current speech_threshold is 200.0, BELOW the 250-350 ambient described here — per the CRITICAL rule above this would wedge the detector in SPEAKING; confirm this room is quieter than measured, or raise toward 400.",
    "speech_threshold": 200.0,
    "min_word_duration": 0.25,
    "max_word_duration": 2.50,
    "post_silence": 0.20,
    "wake_cooldown": 1.00,
    "wake_chunk_ms": 50,
    "wake_adaptive_window_n": 50,
    "wake_adaptive_mult": 2.0,
    "wake_diag_log_sec": 3.0,

    "wake_ack": "tts",
    "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",

    "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
    "wake_verify_enabled": true,

    "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
    "speech_entry_rms": 400.0,
    "silence_exit_rms": 200.0,
    "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.",
    "silence_duration_sec": 0.6,
    "max_record_sec": 5.0,
    "min_record_sec": 0.4,
    "ambient_probe_sec": 0.2,
    "ambient_mult": 1.5,
    "ambient_cap_rms": 200.0,
    "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
    "recording_enabled": true,
    "recording_keep_count": 50,
    "command_cooldown_sec": 1.5,
    "post_tts_settle_sec": 0.4,
    "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
  },
  "mic": {
    "backend": "builtin_udp",
    "source_index": "3",
    "format": "s16le",
    "rate": 16000,
    "channels": 1
  },
  "mic_udp": {
    "group": "239.168.123.161",
    "port": 5555,
    "buffer_max_bytes": 64000,
    "read_timeout_sec": 0.04
  },
  "speaker": {
    "dds_interface": "eth0",
    "volume": 100,
    "app_name": "sanad"
  },
  "audio": {
    "data_dir": "Data/Voice/Recordings",
    "log_file": "logs/voice.log"
  },
  "messages": {
    "wake_heard": "Yes",
    "no_speech": "I didn't catch that, please say it again",
    "error_tts": "Speech synthesis failed",
    "error_mic": "Microphone error",
    "ready": "Voice system ready"
  }
}