Update 2026-04-22 17:54:49

2026-04-22 17:54:50 +04:00 · 2026-04-22 17:54:50 +04:00 · ce09b6920a
commit ce09b6920a
parent 00e52496a9
25 changed files with 355 additions and 93 deletions
--- a/Config/config_Voice.json
+++ b/Config/config_Voice.json
@ -5,17 +5,20 @@
    "target_sample_rate": 16000
  },
  "stt": {
-    "wake_model": "tiny",
-    "command_model": "tiny",
+    "backend": "vosk",
+    "vosk_model_path": "Models/vosk-model-small-en-us-0.15",
    "wake_words_en": [
      "sanad", "sannad", "sanat", "sunnat",
-      "senad", "sennad", "sanid", "sanud", "sand",
-      "samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
+      "senad", "sennad", "sanid", "sanud",
+      "samad", "sandy", "sanday", "sunday", "synod", "signed",
+      "sand", "send", "sent", "set", "seen", "seed",
+      "then", "than", "that", "step", "stuck",
+      "said", "sad", "saw", "so", "sir", "sun"
    ],
    "language": "en",
    "command_timeout_sec": 10,
-    "silence_threshold": 500,
-    "silence_duration_sec": 1.5,
+    "silence_threshold": 150,
+    "silence_duration_sec": 2.0,
    "max_record_sec": 15
  },
  "mic": {
--- a/Data/Brain/Sessions/session_003_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/alerts.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_003_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/commands.json
@ -0,0 +1,8 @@
+[
+  {
+    "time": "16:22:54",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  }
+]
--- a/Data/Brain/Sessions/session_003_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/detections.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_003_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_003_2026-04-22/places.json
@ -0,0 +1 @@
+{}
--- a/Data/Brain/Sessions/session_004_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/alerts.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_004_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/commands.json
@ -0,0 +1,8 @@
+[
+  {
+    "time": "16:24:12",
+    "cmd": "what do you see",
+    "response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
+    "duration_s": 0.0
+  }
+]
--- a/Data/Brain/Sessions/session_004_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/detections.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_004_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_004_2026-04-22/places.json
@ -0,0 +1 @@
+{}
--- a/Models/vosk-model-small-en-us-0.15.zip
+++ b/Models/vosk-model-small-en-us-0.15.zip
--- a/Models/vosk-model-small-en-us-0.15/README
+++ b/Models/vosk-model-small-en-us-0.15/README
@ -0,0 +1,9 @@
+US English model for mobile Vosk applications
+
+Copyright 2020 Alpha Cephei Inc
+
+Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
+Speed: 0.11xRT (desktop)
+Latency: 0.15s (right context)
+
+
--- a/Models/vosk-model-small-en-us-0.15/am/final.mdl
+++ b/Models/vosk-model-small-en-us-0.15/am/final.mdl
--- a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
+++ b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
@ -0,0 +1,7 @@
+--sample-frequency=16000
+--use-energy=false
+--num-mel-bins=40
+--num-ceps=40
+--low-freq=20
+--high-freq=7600
+--allow-downsample=true
--- a/Models/vosk-model-small-en-us-0.15/conf/model.conf
+++ b/Models/vosk-model-small-en-us-0.15/conf/model.conf
@ -0,0 +1,10 @@
+--min-active=200
+--max-active=3000
+--beam=10.0
+--lattice-beam=2.0
+--acoustic-scale=1.0
+--frame-subsampling-factor=3
+--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
+--endpoint.rule2.min-trailing-silence=0.5
+--endpoint.rule3.min-trailing-silence=0.75
+--endpoint.rule4.min-trailing-silence=1.0
--- a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst
+++ b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst
--- a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
+++ b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
--- a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
+++ b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
@ -0,0 +1,17 @@
+10015
+10016
+10017
+10018
+10019
+10020
+10021
+10022
+10023
+10024
+10025
+10026
+10027
+10028
+10029
+10030
+10031
--- a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
+++ b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
@ -0,0 +1,166 @@
+1 nonword
+2 begin
+3 end
+4 internal
+5 singleton
+6 nonword
+7 begin
+8 end
+9 internal
+10 singleton
+11 begin
+12 end
+13 internal
+14 singleton
+15 begin
+16 end
+17 internal
+18 singleton
+19 begin
+20 end
+21 internal
+22 singleton
+23 begin
+24 end
+25 internal
+26 singleton
+27 begin
+28 end
+29 internal
+30 singleton
+31 begin
+32 end
+33 internal
+34 singleton
+35 begin
+36 end
+37 internal
+38 singleton
+39 begin
+40 end
+41 internal
+42 singleton
+43 begin
+44 end
+45 internal
+46 singleton
+47 begin
+48 end
+49 internal
+50 singleton
+51 begin
+52 end
+53 internal
+54 singleton
+55 begin
+56 end
+57 internal
+58 singleton
+59 begin
+60 end
+61 internal
+62 singleton
+63 begin
+64 end
+65 internal
+66 singleton
+67 begin
+68 end
+69 internal
+70 singleton
+71 begin
+72 end
+73 internal
+74 singleton
+75 begin
+76 end
+77 internal
+78 singleton
+79 begin
+80 end
+81 internal
+82 singleton
+83 begin
+84 end
+85 internal
+86 singleton
+87 begin
+88 end
+89 internal
+90 singleton
+91 begin
+92 end
+93 internal
+94 singleton
+95 begin
+96 end
+97 internal
+98 singleton
+99 begin
+100 end
+101 internal
+102 singleton
+103 begin
+104 end
+105 internal
+106 singleton
+107 begin
+108 end
+109 internal
+110 singleton
+111 begin
+112 end
+113 internal
+114 singleton
+115 begin
+116 end
+117 internal
+118 singleton
+119 begin
+120 end
+121 internal
+122 singleton
+123 begin
+124 end
+125 internal
+126 singleton
+127 begin
+128 end
+129 internal
+130 singleton
+131 begin
+132 end
+133 internal
+134 singleton
+135 begin
+136 end
+137 internal
+138 singleton
+139 begin
+140 end
+141 internal
+142 singleton
+143 begin
+144 end
+145 internal
+146 singleton
+147 begin
+148 end
+149 internal
+150 singleton
+151 begin
+152 end
+153 internal
+154 singleton
+155 begin
+156 end
+157 internal
+158 singleton
+159 begin
+160 end
+161 internal
+162 singleton
+163 begin
+164 end
+165 internal
+166 singleton
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.ie
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.ie
--- a/Models/vosk-model-small-en-us-0.15/ivector/final.mat
+++ b/Models/vosk-model-small-en-us-0.15/ivector/final.mat
--- a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats
+++ b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats
@ -0,0 +1,3 @@
+ [
+  1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 
+  1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
--- a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf
+++ b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf
@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
--- a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf
+++ b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf
@ -0,0 +1,2 @@
+--left-context=3
+--right-context=3
--- a/Voice/marcus_voice.py
+++ b/Voice/marcus_voice.py
@ -27,6 +27,7 @@ import sys
 import threading
 import time
 from logging.handlers import RotatingFileHandler
+from typing import Optional

 import numpy as np

@ -87,9 +88,10 @@ class VoiceModule:
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

-        # Whisper models — lazy loaded on first _voice_loop() iteration
-        self._wake_model = None
-        self._cmd_model = None
+        # STT (Vosk) — lazy loaded on first _voice_loop() iteration.
+        # One Model instance, recognizers are created fresh per-utterance.
+        self._vosk_model = None
+        self._KaldiRecognizer = None

        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
@ -115,29 +117,52 @@ class VoiceModule:

    # ─── MODEL LOADING ────────────────────────────────────

-    def _load_whisper(self):
+    def _load_stt(self):
        """
-        Lazy-load Whisper models on CPU.
+        Load Vosk ASR model. Replaces openai-whisper which produced garbage
+        (!!!!!!!) on this Jetson's torch-aarch64 install regardless of
+        audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no
+        numerical instability, ~10× faster than Whisper base on CPU.

-        Force device='cpu' regardless of torch.cuda.is_available(). On the
-        Jetson the torch install sometimes claims CUDA but can't deserialize
-        to it (aarch64 wheel mismatch), and Whisper's default device-auto
-        then crashes with:
-          _pickle.UnpicklingError: Weights only load failed.
-          Attempting to deserialize object on CUDA device 0
-        CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
+        Model path is configured via stt.vosk_model_path (relative to
+        PROJECT_ROOT, or absolute). Default: the small English model,
+        which is ~40 MB and plenty for short voice commands.
        """
-        import whisper
+        from vosk import Model, KaldiRecognizer, SetLogLevel
+        SetLogLevel(-1)   # silence Vosk's stderr spam

-        if self._wake_model is None:
-            log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
-            self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
-            log.info("Wake model ready")
+        if self._vosk_model is None:
+            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
+            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
+            if not os.path.isdir(model_path):
+                raise RuntimeError(
+                    "[Voice] Vosk model not found at " + model_path + "\n"
+                    "  Download it on the Jetson:\n"
+                    "    cd ~/Marcus/Models\n"
+                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
+                    "    unzip vosk-model-small-en-us-0.15.zip"
+                )
+            log.info("Loading Vosk model: %s", model_path)
+            self._vosk_model = Model(model_path)
+            self._KaldiRecognizer = KaldiRecognizer
+            log.info("Vosk model ready")

-        if self._cmd_model is None:
-            log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
-            self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
-            log.info("Command model ready")
+            # NO restricted grammar. Vosk's small English model's lexicon
+            # doesn't contain "sanad" (it's not an English word), so passing
+            # it in a restricted grammar makes Vosk drop the word with:
+            #   WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
+            #   vocabulary: 'sanad'
+            # and the decoder then only has "[unk]" → never matches
+            # anything → Transcribed always empty.
+            #
+            # Instead: open vocabulary transcription, fuzzy-match against
+            # the stt.wake_words_en list which contains the English words
+            # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
+            # step, signed, etc.).
+            self._wake_grammar = None
+
+    # Back-compat alias for any caller that still references the old name
+    _load_whisper = _load_stt

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────

@ -189,72 +214,55 @@ class VoiceModule:

    # ─── TRANSCRIPTION ────────────────────────────────────

-    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
-        """Transcribe audio using Whisper. Returns text."""
-        import warnings
-        import whisper
+    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
+        """
+        Transcribe audio using Vosk.

-        # Audio stats — log before transcribe so we can see exactly what
-        # Whisper is being fed. Useful when wake-word never fires: if
-        # peak_int16 is always < 500 the mic is too quiet regardless of
-        # any software gain.
+        When `grammar` is a JSON list string (e.g. `'["yes","no","[unk]"]'`),
+        Vosk is constrained to that vocabulary only; every word must exist in
+        the model's lexicon (which is why "sanad" cannot be used here). Pass
+        grammar=None for open-vocabulary transcription (wake word and commands).
+        """
+        import json as _json
+
+        # Audio stats — still useful for "mic is silent" diagnostics.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16  = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d  peak=%d  rms=%.1f", audio.size, peak_i16, rms_i16)

-        # Convert int16 to float32 [-1, 1]
-        audio_f32 = audio.astype(np.float32) / 32768.0
-
-        # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
-        # Harmless on already-loud audio. Skip if peak is essentially zero
-        # (no signal at all) — amplifying pure noise doesn't help.
-        peak = float(np.abs(audio_f32).max())
-        if peak > 1e-4 and peak < 0.9:
-            audio_f32 = audio_f32 * (0.9 / peak)
-            log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
-
-        # Suppress the per-call "Performing inference on CPU when CUDA is
-        # available" UserWarning. A module-level warnings.filterwarnings()
-        # doesn't catch it because whisper re-issues the warning every call
-        # via its own logger path. catch_warnings scoped to this call is
-        # the clean way.
-        #
-        # CRITICAL: temperature=0.0 (greedy, no fallback).
-        # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
-        # 0.8, 1.0) — it retries with higher temperatures when the greedy
-        # pass misses a quality gate. The retry path calls
-        # `Categorical(logits=logits / temperature).sample()` which blows
-        # up on Jetson's torch-aarch64 (logits overflow to inf → softmax
-        # becomes NaN). Traceback (2026-04-22):
-        #   ValueError: Expected parameter logits ... found invalid values:
-        #   tensor([[nan, nan, nan, ..., nan, nan, nan]])
-        # The voice thread crashed every 2 s and wake-word never fired.
-        # Forcing temperature=0.0 stays on the greedy path (argmax), which
-        # has no Categorical sampler and no numerical instability.
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            result = model.transcribe(
-                audio_f32,
-                language=self._stt["language"],   # None = auto-detect
-                task=task,
-                fp16=False,
-                temperature=0.0,                  # no fallback — avoids NaN bug
-                condition_on_previous_text=False, # no accumulated context
-            )
-        text = result["text"].strip()
-        detected_lang = result.get("language", "unknown")
-
-        # Filter Whisper's "no phonetic content" degeneration patterns.
-        # Near-silence or very quiet speech can produce repetitive filler
-        # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
-        # repeated word. Treat anything with < 3 distinct alphanumeric
-        # characters as silence so the wake-word check doesn't see it.
-        alnum = ''.join(c.lower() for c in text if c.isalnum())
-        if not alnum or len(set(alnum)) < 3:
-            log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
+        if audio.size == 0:
            return ""

-        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
+        # Fresh recognizer per utterance. Pass grammar if provided.
+        if grammar:
+            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
+        else:
+            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
+        rec.SetWords(False)
+
+        # Single-shot: feed the whole utterance in one AcceptWaveform call,
+        # then take FinalResult. Chunk-based feeding split short "sanad"
+        # utterances across chunk boundaries and Vosk's decoder often
+        # refused to commit, returning empty. Single-shot works for every
+        # voice-assistant example in Vosk's docs.
+        #
+        # When FinalResult is empty, fall back to PartialResult as a safety
+        # net. NOTE(review): FinalResult() already flushes the decoder, so
+        # this fallback may never yield text — confirm against Vosk's API.
+        rec.AcceptWaveform(audio.tobytes())
+        final = _json.loads(rec.FinalResult()).get("text", "").strip()
+        if not final:
+            partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
+            if partial:
+                final = partial
+                log.info("  (partial only, no final commit)")
+        text = final
+
+        if not text:
+            log.info("Transcribed: (empty)")
+            return ""
+
+        log.info("Transcribed: %s", text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
@ -298,23 +306,29 @@ class VoiceModule:
                time.sleep(1)

    def _do_idle(self):
-        """Listen for wake word in 2-second chunks."""
+        """Listen for wake word in 4-second chunks. Longer windows give
+        Vosk's decoder enough context to commit short utterances like a
+        single 'sanad'."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

-        audio = self._record_chunk(2.0)
+        audio = self._record_chunk(4.0)

        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return

-        # Skip if too quiet (no one talking)
-        if audio.std() < 100:
+        # Skip if too quiet (no one talking). Threshold lowered to 60 to
+        # match the G1 on-board mic's typical noise floor (std ~30-80 when
+        # idle, ~150+ when someone speaks). With 100 we were skipping
+        # quiet "sanad" utterances entirely.
+        if audio.std() < 60:
            return

-        text = self._transcribe(audio, self._wake_model)
+        # Wake-word pass is open-vocabulary (self._wake_grammar is None); see _load_stt.
+        text = self._transcribe(audio, grammar=self._wake_grammar)

        if self._check_wake_word(text):
            log.info("Wake word detected!")
@ -330,10 +344,18 @@ class VoiceModule:

    def _do_wake_heard(self):
        """Record the command until silence."""
-        # Wait for "Listening..." TTS to finish before recording
+        # Wait for "Yes" TTS to finish before recording.
        while self._audio.is_speaking:
            time.sleep(0.1)

+        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
+        # has been accumulating audio continuously (including pre-wake
+        # silence and the TTS "Yes" that just played back into the mic
+        # path). Without flush, _record_until_silence() reads the old
+        # buffered silence instantly, counts 3 silent chunks, and exits
+        # before the user has started speaking the command.
+        self._mic_capture.flush()
+
        log.info("Recording command...")
        audio = self._record_until_silence()

@ -348,7 +370,7 @@ class VoiceModule:

    def _do_processing(self):
        """Transcribe the command and send to brain."""
-        text = self._transcribe(self._command_audio, self._cmd_model)
+        text = self._transcribe(self._command_audio)
        self._command_audio = None

        if not text or len(text.strip()) < 2:
				`@ -0,0 +1 @@`
				`# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh`