Update 2026-04-22 17:01:46

This commit is contained in:
kassam 2026-04-22 17:01:48 +04:00
parent 78a5b0b408
commit 00e52496a9
12 changed files with 110 additions and 19 deletions

View File

@ -7,7 +7,11 @@
"stt": {
"wake_model": "tiny",
"command_model": "tiny",
"wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
"wake_words_en": [
"sanad", "sannad", "sanat", "sunnat",
"senad", "sennad", "sanid", "sanud", "sand",
"samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
],
"language": "en",
"command_timeout_sec": 10,
"silence_threshold": 500,
@ -37,7 +41,7 @@
"log_file": "logs/voice.log"
},
"messages": {
"wake_heard": "Listening",
"wake_heard": "Yes",
"no_speech": "I didn't catch that, please say it again",
"error_tts": "Speech synthesis failed",
"error_mic": "Microphone error",

View File

@ -45,11 +45,14 @@ goal_prompt: |
{{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
Rules:
- reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false.
- For compound goals ("person holding phone"), both parts must be visible in the SAME frame.
- reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false.
- "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations).
- "person" means a human body visible — not just a chair or bag that belongs to someone.
- If you are not sure the target type matches exactly → reached = false, keep searching.
- For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame.
- confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
- next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
- speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content.
- speak: write a concrete description of the objects visible in THIS frame, in your own words.
# ── PATROL PROMPT ────────────────────────────────────────────────────────────

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "15:29:59",
"cmd": "hello, can you hear me",
"response": "Hello, can you hear me?",
"duration_s": 4.69
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "15:37:37",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -8,8 +8,9 @@ State machine:
PROCESSING --(Whisper transcribe, send to brain)--> SPEAKING
SPEAKING --(TTS done)--> IDLE
Wake word: "Marcus" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
config_Voice.json::stt.wake_words_en)
Commands: Transcribed by Whisper tiny (small if quality suffers)
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
@ -190,27 +191,85 @@ class VoiceModule:
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
    """Transcribe int16 PCM audio with a Whisper model and return the text.

    Args:
        audio: 1-D int16 numpy array (16 kHz mono — TODO confirm against the
            mic capture path; Whisper expects 16 kHz input).
        model: a loaded Whisper model exposing ``.transcribe(...)``.
        task:  Whisper task, ``"transcribe"`` (default) or ``"translate"``.

    Returns:
        The stripped transcript, or ``""`` when the audio is judged to be
        silence / degenerate filler.
    """
    import warnings

    # Audio stats — logged before transcription so we can see exactly what
    # Whisper is being fed. Useful when the wake word never fires: if
    # peak_int16 stays < 500 the mic is too quiet regardless of any
    # software gain.
    peak_i16 = int(np.abs(audio).max()) if audio.size else 0
    rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
    log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)

    # Convert int16 to float32 in [-1, 1], the range Whisper consumes.
    audio_f32 = audio.astype(np.float32) / 32768.0

    # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
    # Harmless on already-loud audio. Skipped when the peak is essentially
    # zero (no signal at all) — amplifying pure noise doesn't help.
    # (Guarded against empty input: .max() on a zero-size array raises.)
    peak = float(np.abs(audio_f32).max()) if audio_f32.size else 0.0
    if 1e-4 < peak < 0.9:
        audio_f32 = audio_f32 * (0.9 / peak)
        log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)

    # Suppress the per-call "Performing inference on CPU when CUDA is
    # available" UserWarning. A module-level warnings.filterwarnings()
    # doesn't catch it because whisper re-issues the warning every call
    # via its own logger path; catch_warnings scoped to this call is the
    # clean way.
    #
    # CRITICAL: temperature=0.0 (greedy, no fallback).
    # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6, 0.8,
    # 1.0) — it retries with higher temperatures when the greedy pass
    # misses a quality gate. The retry path calls
    # `Categorical(logits=logits / temperature).sample()`, which blows up
    # on Jetson's torch-aarch64 (logits overflow to inf → softmax becomes
    # NaN):
    #   ValueError: Expected parameter logits ... found invalid values:
    #   tensor([[nan, nan, nan, ..., nan, nan, nan]])
    # The voice thread crashed every 2 s and wake-word never fired.
    # Forcing temperature=0.0 stays on the greedy path (argmax), which has
    # no Categorical sampler and no numerical instability.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],    # None = auto-detect
            task=task,
            fp16=False,
            temperature=0.0,                   # no fallback — avoids NaN bug
            condition_on_previous_text=False,  # no accumulated context
        )
    text = result["text"].strip()
    detected_lang = result.get("language", "unknown")

    # Filter Whisper's "no phonetic content" degeneration patterns.
    # Near-silence or very quiet speech can produce repetitive filler like
    # "!!!!!!!!!", ". . . . .", "... ... ...", or a single repeated word.
    # Treat anything with < 3 distinct alphanumeric characters as silence
    # so the wake-word check doesn't see it.
    alnum = ''.join(c.lower() for c in text if c.isalnum())
    if not alnum or len(set(alnum)) < 3:
        log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
        return ""

    log.info("Transcribed [%s]: %s", detected_lang, text[:100])
    return text
def _check_wake_word(self, text: str) -> bool:
"""Check if transcribed text contains an English wake word."""
"""
Check if transcribed text contains an English wake word.
Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
but is lenient about punctuation/whitespace around the word.
"""
import re
text_lower = text.lower().strip()
return any(w in text_lower for w in self._wake_en)
# word-boundary regex built once per call (cheap; runs 2×/sec)
for w in self._wake_en:
if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
return True
return False
# ─── MAIN LOOP ────────────────────────────────────────

View File

@ -7,12 +7,15 @@ import os
import sys
import warnings
# Silence known-harmless third-party deprecation warnings before ANY heavy
# import fires them. Keeps the terminal dashboard readable.
# Silence known-harmless third-party warnings before ANY heavy import fires
# them. Keeps the terminal dashboard readable.
# - TypedStorage : fires from torch during yolov8m.pt checkpoint load
# - torch.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
# - torch.cuda.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
# - Whisper CPU : fires on every transcribe call; we intentionally force CPU
# to avoid a torch-aarch64 CUDA deserialization bug
warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")
warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*")
warnings.filterwarnings("ignore", message=".*Performing inference on CPU when CUDA is available.*")
os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils")
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))