diff --git a/Config/config_Voice.json b/Config/config_Voice.json index 0332956..ce58843 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -5,11 +5,14 @@ "target_sample_rate": 16000 }, "stt": { - "backend": "faster_whisper", + "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.", + "backend": "moonshine", + "moonshine_language": "en", "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", - "_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).", + "_mode_comment": "Three modes. 'always_on_gated' (Sanad-style; opt-in — this file ships mode = 'wake_and_command') = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad' followed by a command in the same utterance (e.g. 'Sanad, turn left') dispatches that command immediately. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 
'always_on' = transcribe + dispatch everything, no gate (chatty).", "mode": "wake_and_command", + "await_command_timeout_sec": 10.0, "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", "always_on_speech_entry_rms": 150.0, @@ -36,7 +39,7 @@ "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", - "whisper_initial_prompt": "Robot voice command.", + "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.", "_vocab_comment": "Tunable vocab lists for voice post-processing. 
wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", "wake_words": [ @@ -85,21 +88,21 @@ "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.", - "speech_threshold": 400.0, + "speech_threshold": 200.0, "min_word_duration": 0.25, "max_word_duration": 2.50, "post_silence": 0.20, "wake_cooldown": 1.00, "wake_chunk_ms": 50, "wake_adaptive_window_n": 50, - "wake_adaptive_mult": 3.0, + "wake_adaptive_mult": 2.0, "wake_diag_log_sec": 3.0, "wake_ack": "tts", "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", - "_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). 
Set true again if background noise is triggering too many false wakes.", - "wake_verify_enabled": false, + "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight STT decode (Whisper, or Moonshine when backend is 'moonshine') on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty transcription) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the STT decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.", + "wake_verify_enabled": true, "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. 
Room-measured ambient ≈ 250-350 → entry 400 with margin.", diff --git a/Data/Brain/Sessions/session_001_2026-04-22/alerts.json b/Data/Brain/Sessions/session_001_2026-04-22/alerts.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_001_2026-04-22/alerts.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/commands.json b/Data/Brain/Sessions/session_001_2026-04-22/commands.json deleted file mode 100644 index a420e30..0000000 --- a/Data/Brain/Sessions/session_001_2026-04-22/commands.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "time": "15:29:59", - "cmd": "hello, can you hear me", - "response": "Hello, can you hear me?", - "duration_s": 4.69 - } -] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/detections.json b/Data/Brain/Sessions/session_001_2026-04-22/detections.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_001_2026-04-22/detections.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/places.json b/Data/Brain/Sessions/session_001_2026-04-22/places.json deleted file mode 100644 index 9e26dfe..0000000 --- a/Data/Brain/Sessions/session_001_2026-04-22/places.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/alerts.json b/Data/Brain/Sessions/session_002_2026-04-22/alerts.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_002_2026-04-22/alerts.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/commands.json b/Data/Brain/Sessions/session_002_2026-04-22/commands.json deleted file mode 100644 index b5aa0ad..0000000 --- a/Data/Brain/Sessions/session_002_2026-04-22/commands.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "time": "15:37:37", - "cmd": "turn left", 
- "response": "local command", - "duration_s": 0.0 - } -] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/detections.json b/Data/Brain/Sessions/session_002_2026-04-22/detections.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_002_2026-04-22/detections.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/places.json b/Data/Brain/Sessions/session_002_2026-04-22/places.json deleted file mode 100644 index 9e26dfe..0000000 --- a/Data/Brain/Sessions/session_002_2026-04-22/places.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/alerts.json b/Data/Brain/Sessions/session_003_2026-04-22/alerts.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_003_2026-04-22/alerts.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/commands.json b/Data/Brain/Sessions/session_003_2026-04-22/commands.json deleted file mode 100644 index 6f3c242..0000000 --- a/Data/Brain/Sessions/session_003_2026-04-22/commands.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "time": "16:22:54", - "cmd": "hi", - "response": "Hello! I am Sanad. 
How can I help you?", - "duration_s": 0.0 - } -] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/detections.json b/Data/Brain/Sessions/session_003_2026-04-22/detections.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_003_2026-04-22/detections.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/places.json b/Data/Brain/Sessions/session_003_2026-04-22/places.json deleted file mode 100644 index 9e26dfe..0000000 --- a/Data/Brain/Sessions/session_003_2026-04-22/places.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/alerts.json b/Data/Brain/Sessions/session_004_2026-04-22/alerts.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_004_2026-04-22/alerts.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/commands.json b/Data/Brain/Sessions/session_004_2026-04-22/commands.json deleted file mode 100644 index 67b8bc6..0000000 --- a/Data/Brain/Sessions/session_004_2026-04-22/commands.json +++ /dev/null @@ -1,8 +0,0 @@ -[ - { - "time": "16:24:12", - "cmd": "what do you see", - "response": "I see a man sitting in front of a desk with a backpack and a helmet on it. 
There is also a red and white object hanging on the wall behind him.", - "duration_s": 0.0 - } -] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/detections.json b/Data/Brain/Sessions/session_004_2026-04-22/detections.json deleted file mode 100644 index 0637a08..0000000 --- a/Data/Brain/Sessions/session_004_2026-04-22/detections.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/places.json b/Data/Brain/Sessions/session_004_2026-04-22/places.json deleted file mode 100644 index 9e26dfe..0000000 --- a/Data/Brain/Sessions/session_004_2026-04-22/places.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/Data/Voice/Recordings/unk_1776999824.wav b/Data/Voice/Recordings/unk_1776999824.wav new file mode 100644 index 0000000..b2eeef9 Binary files /dev/null and b/Data/Voice/Recordings/unk_1776999824.wav differ diff --git a/Data/Voice/Recordings/unk_1777000093.wav b/Data/Voice/Recordings/unk_1777000093.wav new file mode 100644 index 0000000..41ec751 Binary files /dev/null and b/Data/Voice/Recordings/unk_1777000093.wav differ diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index 0dbbb87..59651ea 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -60,6 +60,30 @@ logging.basicConfig( log = logging.getLogger("marcus_voice") +# ── Transcript log ───────────────────────────────────────────── +# Every transcribed utterance (wake or not, command or not) is +# written here in a simple one-line-per-entry format so the operator +# can scan everything the mic heard without wading through the full +# voice.log. Rotates every 5 MB × 3 backups. 
+_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log") +_transcript_log = logging.getLogger("transcript") +_transcript_log.setLevel(logging.INFO) +_transcript_log.propagate = False # don't double-emit +if not _transcript_log.handlers: + _th = RotatingFileHandler( + _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8", + ) + _th.setFormatter(logging.Formatter("%(asctime)s %(message)s")) + _transcript_log.addHandler(_th) + + +def _log_transcript(action: str, text: str) -> None: + """Write one line to logs/transcript.log. + action: 'HEARD' / 'WAKE' / 'CMD' / 'UNK' / ... + """ + _transcript_log.info("%-5s %s", action, (text or "").strip()) + + # Module-level vocabulary containers. EMPTY on import — populated by # VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words, # command_vocab, garbage_patterns}. Config is the single source of truth; @@ -248,13 +272,35 @@ class VoiceModule: if self._mic_gain != 1.0: log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) - # ── faster-whisper (lazy-init on first wake) ── + # ── STT backend selection ── + # "faster_whisper" (default): Whisper base.en int8 on CPU via CTranslate2. + # "moonshine": useful-sensors Moonshine via moonshine-voice + # + onnxruntime. Different training, different + # error profile from Whisper — useful when + # Whisper's hallucinations (short "Yes.", + # "Bye.", "It.") are the failure mode. + self._backend_name = str(self._stt.get("backend", "faster_whisper")).lower() + + # Lazy-init handles — concrete model loads on first wake so startup + # stays light. `False` marks a failed init so we don't keep retrying. self._fw = None + self._moonshine = None + + # ── Two-turn wake state (always_on_gated mode) ── + # self._awaiting_command: False = listening for wake. + # True = wake heard, next utterance is + # the command. Cleared after the + # command dispatches or after + # await_command_timeout_sec seconds + # so a stray "Sanad" doesn't arm + # forever. 
+ self._awaiting_command = False + self._await_deadline = 0.0 self._running = False self._thread = None self._cooldown_until = 0.0 - log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") + log.info("VoiceModule initialized (wake=custom, stt=%s)", self._backend_name) # ─── gain-applied mic read ──────────────────────────── @@ -299,6 +345,49 @@ class VoiceModule: self._fw = None return self._fw + # ─── lazy moonshine init ────────────────────────────── + + def _get_moonshine(self): + """ + Load Moonshine (useful-sensors) on first use. Requires + `pip install moonshine-voice` on the target. Returns None if + the package isn't available — caller should fall back. + """ + if self._moonshine is not None: + return self._moonshine if self._moonshine is not False else None + lang = self._stt.get("moonshine_language", "en") + log.info("Loading Moonshine: language=%s", lang) + try: + from moonshine_voice import Transcriber + from moonshine_voice.download import download_model + model_path, model_arch = download_model(language=lang) + self._moonshine = Transcriber( + model_path=model_path, model_arch=model_arch, + ) + log.info("Moonshine ready: arch=%s", model_arch) + except Exception as e: + log.error("Moonshine init failed: %s — voice will be wake-only", e) + self._moonshine = False + return None + return self._moonshine + + def _moonshine_decode(self, audio_f32: np.ndarray) -> str: + """Run Moonshine one-shot on a float32 16kHz mono array. 
Returns ''.""" + m = self._get_moonshine() + if m is None: + return "" + try: + result = m.transcribe_without_streaming( + audio_data=audio_f32, sample_rate=self._sample_rate, + ) + lines = getattr(result, "lines", None) or [] + text = " ".join(getattr(ln, "text", "") for ln in lines).strip() + log.info("moonshine: text=%r", text[:80]) + return text + except Exception as e: + log.error("moonshine transcribe failed: %s", e) + return "" + # ─── command recording ──────────────────────────────── def _record_command(self) -> np.ndarray: @@ -445,7 +534,10 @@ class VoiceModule: # ─── transcription ──────────────────────────────────── def _transcribe(self, audio_i16: np.ndarray) -> str: - """int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" + """int16 PCM → STT transcription. Returns '' on no-speech/noise.""" + if self._backend_name == "moonshine": + return self._transcribe_moonshine(audio_i16, lenient=False) + fw = self._get_fw() if fw is None: return "" @@ -596,6 +688,9 @@ class VoiceModule: The downside (no Sanad nudge) is fine here because the acoustic detector has already gated out non-speech. """ + if self._backend_name == "moonshine": + return self._transcribe_moonshine(audio_i16, lenient=True) + fw = self._get_fw() if fw is None: return "" @@ -634,6 +729,39 @@ class VoiceModule: log.error("whisper-raw transcribe failed: %s", e) return "" + def _transcribe_moonshine(self, audio_i16: np.ndarray, lenient: bool) -> str: + """ + Moonshine decode path. Light DSP only (DC-removal + peak-normalize); + Moonshine has its own internal feature extraction, and the Whisper- + oriented pre-emphasis / 80 Hz HPF are not helpful here. + + lenient=True mirrors _transcribe_raw: skip the garbage-pattern and + min-length filters so wake verify can see short /s-/ phonetic signals. + lenient=False applies the same rejection pipeline as _transcribe. 
+ """ + if audio_i16.size == 0: + return "" + + audio_f32 = audio_i16.astype(np.float32) / 32768.0 + audio_f32 = audio_f32 - np.mean(audio_f32) + peak = float(np.abs(audio_f32).max()) + if peak > 1e-4 and peak < 0.7: + audio_f32 = audio_f32 * (0.7 / peak) + + text = self._moonshine_decode(audio_f32) + if not text: + return "" + if lenient: + return text + + low = text.lower().strip().rstrip(".!?,") + vocab_exact = {c.lower() for c in COMMAND_VOCAB} + if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH: + if low not in WAKE_WORDS and low not in vocab_exact: + log.info("Rejecting likely noise transcription: %r", text) + return "" + return text + # ─── command transcription ──────────────────────────── def _transcribe_command(self, audio_i16: np.ndarray) -> str: @@ -925,47 +1053,89 @@ class VoiceModule: text = self._transcribe_command(audio) if audio.size else "" if text: log.info("HEARD: %r", text) - # Gated mode: only dispatch if the wake word was - # spoken. Everything is still logged above so the - # operator has full visibility into what the mic - # is picking up. - if gated and not _has_wake_word(text): - log.info(" (no wake word — not dispatched)") - else: - if gated: - command = _strip_wake_word(text) - if command != text: - log.info(" wake-stripped: %r → %r", - text, command) - # Bare wake word ("Sanad.", "Sanad") → - # speak a "Yes" ack, do NOT call the - # brain (it would hallucinate a random - # response from a 1-word prompt). - if not command: - log.info(" wake-only utterance — speaking ack") - try: - self._audio.speak( - self._messages.get("wake_heard", "Yes") - ) - except Exception as e: - log.warning("wake-ack TTS failed: %s", e) - continue - else: - command = text + _log_transcript("HEARD", text) - # Normalize near-misses ("Turn right up" → - # "turn right") so command_parser's regex - # fast-path can hit without an LLM round-trip. 
+ # ── Two-turn gated flow ──────────────────── + # State A — listening for wake: + # non-wake utterance → log only, do not dispatch + # "Sanad " → strip + dispatch now + # "Sanad" alone → speak "Yes", switch to + # state B ("awaiting command") + # State B — awaiting command (after Yes): + # any utterance → dispatch as the command, + # regardless of wake word. + # Then back to state A. + # + # This matches the SanadVoice/gemini_interact + # pattern: always transcribe + log every word, + # say "yes" on wake, treat the next utterance + # as the command. + # Timeout stale await-command state + if self._awaiting_command and time.time() > self._await_deadline: + log.info(" [awaiting-cmd] timed out — back to wake-listen") + self._awaiting_command = False + + if self._awaiting_command: + # State B — next utterance is the command. + command = _strip_wake_word(text) # drop accidental "Sanad," + if not command: + command = text # safety: never drop to empty command = self._normalize_command(command) - + log.info(" [awaiting-cmd] dispatching: %r", command) + _log_transcript("CMD", command) + self._awaiting_command = False print(f' [Sanad] heard: "{command}"') if self._on_command: try: self._on_command(command, "en") except Exception as e: log.error("on_command: %s", e, exc_info=True) + continue + + # State A — listening for wake. + if gated and not _has_wake_word(text): + log.info(" (no wake word — logged only)") + _log_transcript("IGN", text) + continue + + if gated: + command = _strip_wake_word(text) + if command != text: + log.info(" wake-stripped: %r → %r", + text, command) + if not command: + # Bare "Sanad" — speak "Yes" and arm + # the next-utterance-as-command trigger. 
+ log.info(" wake heard alone — speaking 'Yes', " + "next utterance will be treated as command") + _log_transcript("WAKE", text) + try: + self._audio.speak( + self._messages.get("wake_heard", "Yes") + ) + except Exception as e: + log.warning("wake-ack TTS failed: %s", e) + self._awaiting_command = True + self._await_deadline = time.time() + float( + self._stt.get("await_command_timeout_sec", 10.0) + ) + continue + else: + command = text + + # Sanad + command in one utterance (e.g. + # "Sanad, turn left") → fuzzy-normalize + dispatch. + command = self._normalize_command(command) + _log_transcript("CMD", command) + print(f' [Sanad] heard: "{command}"') + if self._on_command: + try: + self._on_command(command, "en") + except Exception as e: + log.error("on_command: %s", e, exc_info=True) else: log.info("utterance rejected (empty/garbage after Whisper)") + _log_transcript("UNK", "(empty)") else: idle_peak_rms = max(idle_peak_rms, rms) idle_sum_rms += rms