Update 2026-04-24 15:23:19

This commit is contained in:
kassam 2026-04-24 15:23:19 +04:00
parent 5d839d4f4e
commit 9485601e18
20 changed files with 214 additions and 85 deletions

View File

@ -5,11 +5,14 @@
"target_sample_rate": 16000 "target_sample_rate": 16000
}, },
"stt": { "stt": {
"backend": "faster_whisper", "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
"backend": "moonshine",
"moonshine_language": "en",
"_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
"_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).", "_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
"mode": "wake_and_command", "mode": "wake_and_command",
"await_command_timeout_sec": 10.0,
"_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
"always_on_speech_entry_rms": 150.0, "always_on_speech_entry_rms": 150.0,
@ -36,7 +39,7 @@
"_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
"_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
"_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.",
"whisper_initial_prompt": "Robot voice command.", "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",
"_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
"wake_words": [ "wake_words": [
@ -85,21 +88,21 @@
"_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.", "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
"speech_threshold": 400.0, "speech_threshold": 200.0,
"min_word_duration": 0.25, "min_word_duration": 0.25,
"max_word_duration": 2.50, "max_word_duration": 2.50,
"post_silence": 0.20, "post_silence": 0.20,
"wake_cooldown": 1.00, "wake_cooldown": 1.00,
"wake_chunk_ms": 50, "wake_chunk_ms": 50,
"wake_adaptive_window_n": 50, "wake_adaptive_window_n": 50,
"wake_adaptive_mult": 3.0, "wake_adaptive_mult": 2.0,
"wake_diag_log_sec": 3.0, "wake_diag_log_sec": 3.0,
"wake_ack": "tts", "wake_ack": "tts",
"_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
"_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.", "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
"wake_verify_enabled": false, "wake_verify_enabled": true,
"_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.", "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",

View File

@ -1,8 +0,0 @@
[
{
"time": "15:29:59",
"cmd": "hello, can you hear me",
"response": "Hello, can you hear me?",
"duration_s": 4.69
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "15:37:37",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "16:22:54",
"cmd": "hi",
"response": "Hello! I am Sanad. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "16:24:12",
"cmd": "what do you see",
"response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
"duration_s": 0.0
}
]

Binary file not shown.

Binary file not shown.

View File

@ -60,6 +60,30 @@ logging.basicConfig(
log = logging.getLogger("marcus_voice") log = logging.getLogger("marcus_voice")
# ── Transcript log ─────────────────────────────────────────────
# One line per transcribed utterance (wake word or not, command or
# not) in a compact format, so the operator can skim everything the
# mic picked up without digging through the verbose voice.log.
# Rotating file: 5 MB per file, 3 backups kept.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
_transcript_log.propagate = False  # keep entries out of the root logger
if not _transcript_log.handlers:
    _handler = RotatingFileHandler(
        _TRANSCRIPT_PATH,
        maxBytes=5_000_000,
        backupCount=3,
        encoding="utf-8",
    )
    _handler.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_handler)


def _log_transcript(action: str, text: str) -> None:
    """Append one '<ACTION> <text>' line to logs/transcript.log.

    action: short tag — 'HEARD' / 'WAKE' / 'CMD' / 'UNK' / ...
    """
    _transcript_log.info("%-5s %s", action, (text or "").strip())
# Module-level vocabulary containers. EMPTY on import — populated by # Module-level vocabulary containers. EMPTY on import — populated by
# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words, # VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
# command_vocab, garbage_patterns}. Config is the single source of truth; # command_vocab, garbage_patterns}. Config is the single source of truth;
@ -248,13 +272,35 @@ class VoiceModule:
if self._mic_gain != 1.0: if self._mic_gain != 1.0:
log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain)
# ── faster-whisper (lazy-init on first wake) ── # ── STT backend selection ──
# "faster_whisper" (default): Whisper base.en int8 on CPU via CTranslate2.
# "moonshine": useful-sensors Moonshine via moonshine-voice
# + onnxruntime. Different training, different
# error profile from Whisper — useful when
# Whisper's hallucinations (short "Yes.",
# "Bye.", "It.") are the failure mode.
self._backend_name = str(self._stt.get("backend", "faster_whisper")).lower()
# Lazy-init handles — concrete model loads on first wake so startup
# stays light. `False` marks a failed init so we don't keep retrying.
self._fw = None self._fw = None
self._moonshine = None
# ── Two-turn wake state (always_on_gated mode) ──
# self._awaiting_command: False = listening for wake.
# True = wake heard, next utterance is
# the command. Cleared after the
# command dispatches or after
# await_command_timeout_sec seconds
# so a stray "Sanad" doesn't arm
# forever.
self._awaiting_command = False
self._await_deadline = 0.0
self._running = False self._running = False
self._thread = None self._thread = None
self._cooldown_until = 0.0 self._cooldown_until = 0.0
log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") log.info("VoiceModule initialized (wake=custom, stt=%s)", self._backend_name)
# ─── gain-applied mic read ──────────────────────────── # ─── gain-applied mic read ────────────────────────────
@ -299,6 +345,49 @@ class VoiceModule:
self._fw = None self._fw = None
return self._fw return self._fw
# ─── lazy moonshine init ──────────────────────────────
def _get_moonshine(self):
"""
Load Moonshine (useful-sensors) on first use. Requires
`pip install moonshine-voice` on the target. Returns None if
the package isn't available — caller should fall back.
"""
if self._moonshine is not None:
return self._moonshine if self._moonshine is not False else None
lang = self._stt.get("moonshine_language", "en")
log.info("Loading Moonshine: language=%s", lang)
try:
from moonshine_voice import Transcriber
from moonshine_voice.download import download_model
model_path, model_arch = download_model(language=lang)
self._moonshine = Transcriber(
model_path=model_path, model_arch=model_arch,
)
log.info("Moonshine ready: arch=%s", model_arch)
except Exception as e:
log.error("Moonshine init failed: %s — voice will be wake-only", e)
self._moonshine = False
return None
return self._moonshine
def _moonshine_decode(self, audio_f32: np.ndarray) -> str:
"""Run Moonshine one-shot on a float32 16kHz mono array. Returns ''."""
m = self._get_moonshine()
if m is None:
return ""
try:
result = m.transcribe_without_streaming(
audio_data=audio_f32, sample_rate=self._sample_rate,
)
lines = getattr(result, "lines", None) or []
text = " ".join(getattr(ln, "text", "") for ln in lines).strip()
log.info("moonshine: text=%r", text[:80])
return text
except Exception as e:
log.error("moonshine transcribe failed: %s", e)
return ""
# ─── command recording ──────────────────────────────── # ─── command recording ────────────────────────────────
def _record_command(self) -> np.ndarray: def _record_command(self) -> np.ndarray:
@ -445,7 +534,10 @@ class VoiceModule:
# ─── transcription ──────────────────────────────────── # ─── transcription ────────────────────────────────────
def _transcribe(self, audio_i16: np.ndarray) -> str: def _transcribe(self, audio_i16: np.ndarray) -> str:
"""int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" """int16 PCM → STT transcription. Returns '' on no-speech/noise."""
if self._backend_name == "moonshine":
return self._transcribe_moonshine(audio_i16, lenient=False)
fw = self._get_fw() fw = self._get_fw()
if fw is None: if fw is None:
return "" return ""
@ -596,6 +688,9 @@ class VoiceModule:
The downside (no Sanad nudge) is fine here because the acoustic The downside (no Sanad nudge) is fine here because the acoustic
detector has already gated out non-speech. detector has already gated out non-speech.
""" """
if self._backend_name == "moonshine":
return self._transcribe_moonshine(audio_i16, lenient=True)
fw = self._get_fw() fw = self._get_fw()
if fw is None: if fw is None:
return "" return ""
@ -634,6 +729,39 @@ class VoiceModule:
log.error("whisper-raw transcribe failed: %s", e) log.error("whisper-raw transcribe failed: %s", e)
return "" return ""
def _transcribe_moonshine(self, audio_i16: np.ndarray, lenient: bool) -> str:
"""
Moonshine decode path. Light DSP only (DC-removal + peak-normalize);
Moonshine has its own internal feature extraction, and the Whisper-
oriented pre-emphasis / 80 Hz HPF are not helpful here.
lenient=True mirrors _transcribe_raw: skip the garbage-pattern and
min-length filters so wake verify can see short /s-/ phonetic signals.
lenient=False applies the same rejection pipeline as _transcribe.
"""
if audio_i16.size == 0:
return ""
audio_f32 = audio_i16.astype(np.float32) / 32768.0
audio_f32 = audio_f32 - np.mean(audio_f32)
peak = float(np.abs(audio_f32).max())
if peak > 1e-4 and peak < 0.7:
audio_f32 = audio_f32 * (0.7 / peak)
text = self._moonshine_decode(audio_f32)
if not text:
return ""
if lenient:
return text
low = text.lower().strip().rstrip(".!?,")
vocab_exact = {c.lower() for c in COMMAND_VOCAB}
if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
if low not in WAKE_WORDS and low not in vocab_exact:
log.info("Rejecting likely noise transcription: %r", text)
return ""
return text
# ─── command transcription ──────────────────────────── # ─── command transcription ────────────────────────────
def _transcribe_command(self, audio_i16: np.ndarray) -> str: def _transcribe_command(self, audio_i16: np.ndarray) -> str:
@ -925,47 +1053,89 @@ class VoiceModule:
text = self._transcribe_command(audio) if audio.size else "" text = self._transcribe_command(audio) if audio.size else ""
if text: if text:
log.info("HEARD: %r", text) log.info("HEARD: %r", text)
# Gated mode: only dispatch if the wake word was _log_transcript("HEARD", text)
# spoken. Everything is still logged above so the
# operator has full visibility into what the mic
# is picking up.
if gated and not _has_wake_word(text):
log.info(" (no wake word — not dispatched)")
else:
if gated:
command = _strip_wake_word(text)
if command != text:
log.info(" wake-stripped: %r%r",
text, command)
# Bare wake word ("Sanad.", "Sanad") →
# speak a "Yes" ack, do NOT call the
# brain (it would hallucinate a random
# response from a 1-word prompt).
if not command:
log.info(" wake-only utterance — speaking ack")
try:
self._audio.speak(
self._messages.get("wake_heard", "Yes")
)
except Exception as e:
log.warning("wake-ack TTS failed: %s", e)
continue
else:
command = text
# Normalize near-misses ("Turn right up" → # ── Two-turn gated flow ────────────────────
# "turn right") so command_parser's regex # State A — listening for wake:
# fast-path can hit without an LLM round-trip. # non-wake utterance → log only, do not dispatch
# "Sanad <cmd>" → strip + dispatch now
# "Sanad" alone → speak "Yes", switch to
# state B ("awaiting command")
# State B — awaiting command (after Yes):
# any utterance → dispatch as the command,
# regardless of wake word.
# Then back to state A.
#
# This matches the SanadVoice/gemini_interact
# pattern: always transcribe + log every word,
# say "yes" on wake, treat the next utterance
# as the command.
# Timeout stale await-command state
if self._awaiting_command and time.time() > self._await_deadline:
log.info(" [awaiting-cmd] timed out — back to wake-listen")
self._awaiting_command = False
if self._awaiting_command:
# State B — next utterance is the command.
command = _strip_wake_word(text) # drop accidental "Sanad,"
if not command:
command = text # safety: never drop to empty
command = self._normalize_command(command) command = self._normalize_command(command)
log.info(" [awaiting-cmd] dispatching: %r", command)
_log_transcript("CMD", command)
self._awaiting_command = False
print(f' [Sanad] heard: "{command}"') print(f' [Sanad] heard: "{command}"')
if self._on_command: if self._on_command:
try: try:
self._on_command(command, "en") self._on_command(command, "en")
except Exception as e: except Exception as e:
log.error("on_command: %s", e, exc_info=True) log.error("on_command: %s", e, exc_info=True)
continue
# State A — listening for wake.
if gated and not _has_wake_word(text):
log.info(" (no wake word — logged only)")
_log_transcript("IGN", text)
continue
if gated:
command = _strip_wake_word(text)
if command != text:
log.info(" wake-stripped: %r%r",
text, command)
if not command:
# Bare "Sanad" — speak "Yes" and arm
# the next-utterance-as-command trigger.
log.info(" wake heard alone — speaking 'Yes', "
"next utterance will be treated as command")
_log_transcript("WAKE", text)
try:
self._audio.speak(
self._messages.get("wake_heard", "Yes")
)
except Exception as e:
log.warning("wake-ack TTS failed: %s", e)
self._awaiting_command = True
self._await_deadline = time.time() + float(
self._stt.get("await_command_timeout_sec", 10.0)
)
continue
else:
command = text
# Sanad + command in one utterance (e.g.
# "Sanad, turn left") → fuzzy-normalize + dispatch.
command = self._normalize_command(command)
_log_transcript("CMD", command)
print(f' [Sanad] heard: "{command}"')
if self._on_command:
try:
self._on_command(command, "en")
except Exception as e:
log.error("on_command: %s", e, exc_info=True)
else: else:
log.info("utterance rejected (empty/garbage after Whisper)") log.info("utterance rejected (empty/garbage after Whisper)")
_log_transcript("UNK", "(empty)")
else: else:
idle_peak_rms = max(idle_peak_rms, rms) idle_peak_rms = max(idle_peak_rms, rms)
idle_sum_rms += rms idle_sum_rms += rms