Update 2026-04-22 17:54:49

2026-04-22 17:54:50 +04:00 · 2026-04-22 17:54:50 +04:00 · ce09b6920a
commit ce09b6920a
parent 00e52496a9
25 changed files with 355 additions and 93 deletions
--- a/Config/config_Voice.json
+++ b/Config/config_Voice.json
@ -5,17 +5,20 @@
    "target_sample_rate": 16000
  },
  "stt": {
-    "wake_model": "tiny",
+    "backend": "vosk",
-    "command_model": "tiny",
+    "vosk_model_path": "Models/vosk-model-small-en-us-0.15",
    "wake_words_en": [
      "sanad", "sannad", "sanat", "sunnat",
-      "senad", "sennad", "sanid", "sanud", "sand",
+      "senad", "sennad", "sanid", "sanud",
-      "samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
+      "samad", "sandy", "sanday", "sunday", "synod", "signed",
      "sand", "send", "sent", "set", "seen", "seed",
      "then", "than", "that", "step", "stuck",
      "said", "sad", "saw", "so", "sir", "sun"
    ],
    "language": "en",
    "command_timeout_sec": 10,
-    "silence_threshold": 500,
+    "silence_threshold": 150,
-    "silence_duration_sec": 1.5,
+    "silence_duration_sec": 2.0,
    "max_record_sec": 15
  },
  "mic": {
--- a/Data/Brain/Sessions/session_003_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/alerts.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_003_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/commands.json
@ -0,0 +1,8 @@
 [
  {
    "time": "16:22:54",
    "cmd": "hi",
    "response": "Hello! I am Sanad. How can I help you?",
    "duration_s": 0.0
  }
 ]
--- a/Data/Brain/Sessions/session_003_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/detections.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_003_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/places.json
@ -0,0 +1 @@
 {}
--- a/Data/Brain/Sessions/session_004_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/alerts.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_004_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/commands.json
@ -0,0 +1,8 @@
 [
  {
    "time": "16:24:12",
    "cmd": "what do you see",
    "response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
    "duration_s": 0.0
  }
 ]
--- a/Data/Brain/Sessions/session_004_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/detections.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_004_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/places.json
@ -0,0 +1 @@
 {}
--- a/Models/vosk-model-small-en-us-0.15.zip
+++ b/Models/vosk-model-small-en-us-0.15.zip
--- a/Models/vosk-model-small-en-us-0.15/README
+++ b/Models/vosk-model-small-en-us-0.15/README
@ -0,0 +1,9 @@
 US English model for mobile Vosk applications
 Copyright 2020 Alpha Cephei Inc
 Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
 Speed: 0.11xRT (desktop)
 Latency: 0.15s (right context)
--- a/Models/vosk-model-small-en-us-0.15/am/final.mdl
+++ b/Models/vosk-model-small-en-us-0.15/am/final.mdl
--- a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
+++ b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
@ -0,0 +1,7 @@
 --sample-frequency=16000
 --use-energy=false
 --num-mel-bins=40
 --num-ceps=40
 --low-freq=20
 --high-freq=7600
 --allow-downsample=true
--- a/Models/vosk-model-small-en-us-0.15/conf/model.conf
+++ b/Models/vosk-model-small-en-us-0.15/conf/model.conf
@ -0,0 +1,10 @@
 --min-active=200
 --max-active=3000
 --beam=10.0
 --lattice-beam=2.0
 --acoustic-scale=1.0
 --frame-subsampling-factor=3
 --endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
 --endpoint.rule2.min-trailing-silence=0.5
 --endpoint.rule3.min-trailing-silence=0.75
 --endpoint.rule4.min-trailing-silence=1.0
--- a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst
+++ b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst
--- a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
+++ b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
--- a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
+++ b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
@ -0,0 +1,17 @@
 10015
 10016
 10017
 10018
 10019
 10020
 10021
 10022
 10023
 10024
 10025
 10026
 10027
 10028
 10029
 10030
 10031
--- a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
+++ b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
@ -0,0 +1,166 @@
 1 nonword
 2 begin
 3 end
 4 internal
 5 singleton
 6 nonword
 7 begin
 8 end
 9 internal
 10 singleton
 11 begin
 12 end
 13 internal
 14 singleton
 15 begin
 16 end
 17 internal
 18 singleton
 19 begin
 20 end
 21 internal
 22 singleton
 23 begin
 24 end
 25 internal
 26 singleton
 27 begin
 28 end
 29 internal
 30 singleton
 31 begin
 32 end
 33 internal
 34 singleton
 35 begin
 36 end
 37 internal
 38 singleton
 39 begin
 40 end
 41 internal
 42 singleton
 43 begin
 44 end
 45 internal
 46 singleton
 47 begin
 48 end
 49 internal
 50 singleton
 51 begin
 52 end
 53 internal
 54 singleton
 55 begin
 56 end
 57 internal
 58 singleton
 59 begin
 60 end
 61 internal
 62 singleton
 63 begin
 64 end
 65 internal
 66 singleton
 67 begin
 68 end
 69 internal
 70 singleton
 71 begin
 72 end
 73 internal
 74 singleton
 75 begin
 76 end
 77 internal
 78 singleton
 79 begin
 80 end
 81 internal
 82 singleton
 83 begin
 84 end
 85 internal
 86 singleton
 87 begin
 88 end
 89 internal
 90 singleton
 91 begin
 92 end
 93 internal
 94 singleton
 95 begin
 96 end
 97 internal
 98 singleton
 99 begin
 100 end
 101 internal
 102 singleton
 103 begin
 104 end
 105 internal
 106 singleton
 107 begin
 108 end
 109 internal
 110 singleton
 111 begin
 112 end
 113 internal
 114 singleton
 115 begin
 116 end
 117 internal
 118 singleton
 119 begin
 120 end
 121 internal
 122 singleton
 123 begin
 124 end
 125 internal
 126 singleton
 127 begin
 128 end
 129 internal
 130 singleton
 131 begin
 132 end
 133 internal
 134 singleton
 135 begin
 136 end
 137 internal
 138 singleton
 139 begin
 140 end
 141 internal
 142 singleton
 143 begin
 144 end
 145 internal
 146 singleton
 147 begin
 148 end
 149 internal
 150 singleton
 151 begin
 152 end
 153 internal
 154 singleton
 155 begin
 156 end
 157 internal
 158 singleton
 159 begin
 160 end
 161 internal
 162 singleton
 163 begin
 164 end
 165 internal
 166 singleton
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.ie
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.ie
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.mat
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.mat
--- a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats
+++ b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats
@ -0,0 +1,3 @@
 [
  1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 
  1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
--- a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf
+++ b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf
@ -0,0 +1 @@
 # configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
--- a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf
+++ b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf
@ -0,0 +1,2 @@
 --left-context=3
 --right-context=3
--- a/Voice/marcus_voice.py
+++ b/Voice/marcus_voice.py
@ -27,6 +27,7 @@ import sys
 import threading
 import time
 from logging.handlers import RotatingFileHandler
 from typing import Optional
 import numpy as np
@ -87,9 +88,10 @@ class VoiceModule:
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
-        # Whisper models — lazy loaded on first _voice_loop() iteration
+        # STT (Vosk) — lazy loaded on first _voice_loop() iteration.
-        self._wake_model = None
+        # One Model instance, recognizers are created fresh per-utterance.
-        self._cmd_model = None
+        self._vosk_model = None
        self._KaldiRecognizer = None
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
@ -115,29 +117,52 @@ class VoiceModule:
    # ─── MODEL LOADING ────────────────────────────────────
-    def _load_whisper(self):
+    def _load_stt(self):
        """
-        Lazy-load Whisper models on CPU.
+        Load Vosk ASR model. Replaces openai-whisper which produced garbage
        (!!!!!!!) on this Jetson's torch-aarch64 install regardless of
        audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no
        numerical instability, ~10× faster than Whisper base on CPU.
-        Force device='cpu' regardless of torch.cuda.is_available(). On the
+        Model path is configured via stt.vosk_model_path (relative to
-        Jetson the torch install sometimes claims CUDA but can't deserialize
+        PROJECT_ROOT, or absolute). Default: the small English model,
-        to it (aarch64 wheel mismatch), and Whisper's default device-auto
+        which is ~40 MB and plenty for short voice commands.
        then crashes with:
          _pickle.UnpicklingError: Weights only load failed.
          Attempting to deserialize object on CUDA device 0
        CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
        """
-        import whisper
+        from vosk import Model, KaldiRecognizer, SetLogLevel
        SetLogLevel(-1)   # silence Vosk's stderr spam
-        if self._wake_model is None:
+        if self._vosk_model is None:
-            log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
+            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
-            self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
+            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
-            log.info("Wake model ready")
+            if not os.path.isdir(model_path):
                raise RuntimeError(
                    "[Voice] Vosk model not found at " + model_path + "\n"
                    "  Download it on the Jetson:\n"
                    "    cd ~/Marcus/Models\n"
                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
                    "    unzip vosk-model-small-en-us-0.15.zip"
                )
            log.info("Loading Vosk model: %s", model_path)
            self._vosk_model = Model(model_path)
            self._KaldiRecognizer = KaldiRecognizer
            log.info("Vosk model ready")
-        if self._cmd_model is None:
+            # NO restricted grammar. Vosk's small English model's lexicon
-            log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
+            # doesn't contain "sanad" (it's not an English word), so passing
-            self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
+            # it in a restricted grammar makes Vosk drop the word with:
-            log.info("Command model ready")
+            #   WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
            #   vocabulary: 'sanad'
            # and the decoder then only has "[unk]" → never matches
            # anything → Transcribed always empty.
            #
            # Instead: open vocabulary transcription, fuzzy-match against
            # the stt.wake_words_en list which contains the English words
            # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
            # step, signed, etc.).
            self._wake_grammar = None
    # Back-compat alias for any caller that still references the old name
    _load_whisper = _load_stt
    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
@ -189,72 +214,55 @@ class VoiceModule:
    # ─── TRANSCRIPTION ────────────────────────────────────
-    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
+    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
-        """Transcribe audio using Whisper. Returns text."""
+        """
-        import warnings
+        Transcribe audio using Vosk.
        import whisper
-        # Audio stats — log before transcribe so we can see exactly what
+        When `grammar` is a JSON list string (e.g. `'["sanad","[unk]"]'`),
-        # Whisper is being fed. Useful when wake-word never fires: if
+        Vosk is constrained to that vocabulary only — perfect for wake-word
-        # peak_int16 is always < 500 the mic is too quiet regardless of
+        detection where we KNOW the exact word we want to hear. Pass
-        # any software gain.
+        grammar=None for open-vocabulary transcription (commands, and wake-word while _wake_grammar is None).
        """
        import json as _json
        # Audio stats — still useful for "mic is silent" diagnostics.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16  = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d  peak=%d  rms=%.1f", audio.size, peak_i16, rms_i16)
-        # Convert int16 to float32 [-1, 1]
+        if audio.size == 0:
        audio_f32 = audio.astype(np.float32) / 32768.0
        # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
        # Harmless on already-loud audio. Skip if peak is essentially zero
        # (no signal at all) — amplifying pure noise doesn't help.
        peak = float(np.abs(audio_f32).max())
        if peak > 1e-4 and peak < 0.9:
            audio_f32 = audio_f32 * (0.9 / peak)
            log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
        # Suppress the per-call "Performing inference on CPU when CUDA is
        # available" UserWarning. A module-level warnings.filterwarnings()
        # doesn't catch it because whisper re-issues the warning every call
        # via its own logger path. catch_warnings scoped to this call is
        # the clean way.
        #
        # CRITICAL: temperature=0.0 (greedy, no fallback).
        # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
        # 0.8, 1.0) — it retries with higher temperatures when the greedy
        # pass misses a quality gate. The retry path calls
        # `Categorical(logits=logits / temperature).sample()` which blows
        # up on Jetson's torch-aarch64 (logits overflow to inf → softmax
        # becomes NaN). Traceback (2026-04-22):
        #   ValueError: Expected parameter logits ... found invalid values:
        #   tensor([[nan, nan, nan, ..., nan, nan, nan]])
        # The voice thread crashed every 2 s and wake-word never fired.
        # Forcing temperature=0.0 stays on the greedy path (argmax), which
        # has no Categorical sampler and no numerical instability.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = model.transcribe(
                audio_f32,
                language=self._stt["language"],   # None = auto-detect
                task=task,
                fp16=False,
                temperature=0.0,                  # no fallback — avoids NaN bug
                condition_on_previous_text=False, # no accumulated context
            )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        # Filter Whisper's "no phonetic content" degeneration patterns.
        # Near-silence or very quiet speech can produce repetitive filler
        # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
        # repeated word. Treat anything with < 3 distinct alphanumeric
        # characters as silence so the wake-word check doesn't see it.
        alnum = ''.join(c.lower() for c in text if c.isalnum())
        if not alnum or len(set(alnum)) < 3:
            log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
            return ""
-        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
+        # Fresh recognizer per utterance. Pass grammar if provided.
        if grammar:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
        else:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
        rec.SetWords(False)
        # Single-shot: feed the whole utterance in one AcceptWaveform call,
        # then take FinalResult. Chunk-based feeding split short "sanad"
        # utterances across chunk boundaries and Vosk's decoder often
        # refused to commit, returning empty. Single-shot works for every
        # voice-assistant example in Vosk's docs.
        #
        # When FinalResult is empty, also check PartialResult — sometimes
        # Vosk heard something but didn't reach a segmentation boundary
        # yet. PartialResult still has the text, just not "finalized".
        rec.AcceptWaveform(audio.tobytes())
        final = _json.loads(rec.FinalResult()).get("text", "").strip()
        if not final:
            partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
            if partial:
                final = partial
                log.info("  (partial only, no final commit)")
        text = final
        if not text:
            log.info("Transcribed: (empty)")
            return ""
        log.info("Transcribed: %s", text[:100])
        return text
    def _check_wake_word(self, text: str) -> bool:
@ -298,23 +306,29 @@ class VoiceModule:
                time.sleep(1)
    def _do_idle(self):
-        """Listen for wake word in 2-second chunks."""
+        """Listen for wake word in 4-second chunks. Longer windows give
        Vosk's decoder enough context to commit short utterances like a
        single 'sanad'."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
-        audio = self._record_chunk(2.0)
+        audio = self._record_chunk(4.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
-        # Skip if too quiet (no one talking)
+        # Skip if too quiet (no one talking). Threshold lowered to 60 to
-        if audio.std() < 100:
+        # match the G1 on-board mic's typical noise floor (std ~30-80 when
        # idle, ~150+ when someone speaks). With 100 we were skipping
        # quiet "sanad" utterances entirely.
        if audio.std() < 60:
            return
-        text = self._transcribe(audio, self._wake_model)
+        # Wake-word pass: _wake_grammar is None, so this is open-vocabulary too.
        text = self._transcribe(audio, grammar=self._wake_grammar)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
@ -330,10 +344,18 @@ class VoiceModule:
    def _do_wake_heard(self):
        """Record the command until silence."""
-        # Wait for "Listening..." TTS to finish before recording
+        # Wait for "Yes" TTS to finish before recording.
        while self._audio.is_speaking:
            time.sleep(0.1)
        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
        # has been accumulating audio continuously (including pre-wake
        # silence and the TTS "Yes" that just played back into the mic
        # path). Without flush, _record_until_silence() reads the old
        # buffered silence instantly, counts 3 silent chunks, and exits
        # before the user has started speaking the command.
        self._mic_capture.flush()
        log.info("Recording command...")
        audio = self._record_until_silence()
@ -348,7 +370,7 @@ class VoiceModule:
    def _do_processing(self):
        """Transcribe the command and send to brain."""
-        text = self._transcribe(self._command_audio, self._cmd_model)
+        text = self._transcribe(self._command_audio)
        self._command_audio = None
        if not text or len(text.strip()) < 2:
		`@ -0,0 +1,17 @@`
							`10015`
							`10016`
							`10017`
							`10018`
							`10019`
							`10020`
							`10021`
							`10022`
							`10023`
							`10024`
							`10025`
							`10026`
							`10027`
							`10028`
							`10029`
							`10030`
							`10031`
		`@ -0,0 +1 @@`
							`# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh`