Update 2026-04-22 17:01:46

2026-04-22 17:01:48 +04:00 · 2026-04-22 17:01:48 +04:00 · 00e52496a9
commit 00e52496a9
parent 78a5b0b408
12 changed files with 110 additions and 19 deletions
--- a/Config/config_Voice.json
+++ b/Config/config_Voice.json
@ -7,7 +7,11 @@
  "stt": {
    "wake_model": "tiny",
    "command_model": "tiny",
-    "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
+    "wake_words_en": [
      "sanad", "sannad", "sanat", "sunnat",
      "senad", "sennad", "sanid", "sanud", "sand",
      "samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
    ],
    "language": "en",
    "command_timeout_sec": 10,
    "silence_threshold": 500,
@ -37,7 +41,7 @@
    "log_file": "logs/voice.log"
  },
  "messages": {
-    "wake_heard": "Listening",
+    "wake_heard": "Yes",
    "no_speech": "I didn't catch that, please say it again",
    "error_tts": "Speech synthesis failed",
    "error_mic": "Microphone error",
--- a/Config/marcus_prompts.yaml
+++ b/Config/marcus_prompts.yaml
@ -45,11 +45,14 @@ goal_prompt: |
  {{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
  Rules:
-  - reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false.
+  - reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false.
-  - For compound goals ("person holding phone"), both parts must be visible in the SAME frame.
+  - "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations).
  - "person" means a human body visible — not just a chair or bag that belongs to someone.
  - If you are not sure the target type matches exactly → reached = false, keep searching.
  - For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame.
  - confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
  - next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
-  - speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content.
+  - speak: write a concrete description of the objects visible in THIS frame, in your own words.
 # ── PATROL PROMPT ────────────────────────────────────────────────────────────
--- a/Data/Brain/Sessions/session_001_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_001_2026-04-22/alerts.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_001_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_001_2026-04-22/commands.json
@ -0,0 +1,8 @@
 [
  {
    "time": "15:29:59",
    "cmd": "hello, can you hear me",
    "response": "Hello, can you hear me?",
    "duration_s": 4.69
  }
 ]
--- a/Data/Brain/Sessions/session_001_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_001_2026-04-22/detections.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_001_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_001_2026-04-22/places.json
@ -0,0 +1 @@
 {}
--- a/Data/Brain/Sessions/session_002_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_002_2026-04-22/alerts.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_002_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_002_2026-04-22/commands.json
@ -0,0 +1,8 @@
 [
  {
    "time": "15:37:37",
    "cmd": "turn left",
    "response": "local command",
    "duration_s": 0.0
  }
 ]
--- a/Data/Brain/Sessions/session_002_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_002_2026-04-22/detections.json
@ -0,0 +1 @@
 []
--- a/Data/Brain/Sessions/session_002_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_002_2026-04-22/places.json
@ -0,0 +1 @@
 {}
--- a/Voice/marcus_voice.py
+++ b/Voice/marcus_voice.py
@ -8,8 +8,9 @@ State machine:
  PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
  SPEAKING → (TTS done) → IDLE
-Wake word: "Marcus"  (detected by Whisper tiny)
+Wake word: "Sanad"  (detected by Whisper tiny; mistranscription variants in
-Commands:  Transcribed by Whisper small
+                     config_Voice.json::stt.wake_words_en)
 Commands:  Transcribed by Whisper tiny (small if quality suffers)
 Mic:       G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
 TTS:       English only, Unitree built-in TtsMaker (API/audio_api.py)
@ -190,27 +191,85 @@ class VoiceModule:
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """
        Transcribe one audio buffer with Whisper and return the cleaned text.

        Args:
            audio: PCM samples on an int16 scale (they are divided by 32768
                   before being handed to Whisper). Presumably 16 kHz mono —
                   confirm against the mic capture code.
            model: Object exposing Whisper's ``transcribe()`` method.
            task:  Whisper task name, forwarded unchanged.

        Returns:
            The stripped transcript, or "" when the buffer is empty or the
            transcript looks like Whisper's silence/noise filler.
        """
        import warnings

        # Audio stats — log before transcribe so we can see exactly what
        # Whisper is being fed. Useful when wake-word never fires: if
        # peak is always < 500 the mic is too quiet regardless of any
        # software gain.
        if audio.size:
            # Widen first: abs(-32768) wraps around when computed in int16.
            wide = audio.astype(np.float64)
            peak_i16 = int(np.abs(wide).max())
            rms_i16 = float(np.sqrt(np.mean(wide ** 2)))
        else:
            peak_i16, rms_i16 = 0, 0.0
        log.info("audio stats: samples=%d  peak=%d  rms=%.1f", audio.size, peak_i16, rms_i16)

        # Nothing to transcribe. Bail out before the peak computation below,
        # which raises ValueError on a zero-size array.
        if not audio.size:
            return ""

        # Convert int16 to float32 [-1, 1]
        audio_f32 = audio.astype(np.float32) / 32768.0

        # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
        # Harmless on already-loud audio. Skip if peak is essentially zero
        # (no signal at all) — amplifying pure noise doesn't help.
        peak = float(np.abs(audio_f32).max())
        if 1e-4 < peak < 0.9:
            gain = 0.9 / peak
            audio_f32 = audio_f32 * gain
            log.info("normalized ×%.1f (peak %.4f → 0.9)", gain, peak)

        # Suppress the per-call "Performing inference on CPU when CUDA is
        # available" UserWarning. A module-level warnings.filterwarnings()
        # doesn't catch it because whisper re-issues the warning every call
        # via its own logger path. catch_warnings scoped to this call is
        # the clean way.
        #
        # CRITICAL: temperature=0.0 (greedy, no fallback).
        # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
        # 0.8, 1.0) — it retries with higher temperatures when the greedy
        # pass misses a quality gate. The retry path calls
        # `Categorical(logits=logits / temperature).sample()` which blows
        # up on Jetson's torch-aarch64 (logits overflow to inf → softmax
        # becomes NaN). Traceback (2026-04-22):
        #   ValueError: Expected parameter logits ... found invalid values:
        #   tensor([[nan, nan, nan, ..., nan, nan, nan]])
        # The voice thread crashed every 2 s and wake-word never fired.
        # Forcing temperature=0.0 stays on the greedy path (argmax), which
        # has no Categorical sampler and no numerical instability.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = model.transcribe(
                audio_f32,
                language=self._stt["language"],   # None = auto-detect
                task=task,
                fp16=False,
                temperature=0.0,                  # no fallback — avoids NaN bug
                condition_on_previous_text=False, # no accumulated context
            )

        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")

        # Filter Whisper's "no phonetic content" degeneration patterns.
        # Near-silence or very quiet speech can produce repetitive filler
        # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
        # repeated word. Treat anything with < 3 distinct alphanumeric
        # characters as silence so the wake-word check doesn't see it.
        alnum = ''.join(c.lower() for c in text if c.isalnum())
        if len(set(alnum)) < 3:
            log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
            return ""

        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text
    def _check_wake_word(self, text: str) -> bool:
-        """Check if transcribed text contains an English wake word."""
+        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
        but is lenient about punctuation/whitespace around the word.
        """
        import re
        text_lower = text.lower().strip()
-        return any(w in text_lower for w in self._wake_en)
+        # word-boundary regex built once per call (cheap; runs 2×/sec)
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False
    # ─── MAIN LOOP ────────────────────────────────────────
--- a/run_marcus.py
+++ b/run_marcus.py
@ -7,12 +7,15 @@ import os
 import sys
 import warnings
-# Silence known-harmless third-party deprecation warnings before ANY heavy
+# Silence known-harmless third-party warnings before ANY heavy import fires
-# import fires them. Keeps the terminal dashboard readable.
+# them. Keeps the terminal dashboard readable.
 #   - TypedStorage  : fires from torch during yolov8m.pt checkpoint load
-#   - torch.ampwhile : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
+#   - torch.cuda.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
 #   - Whisper CPU   : fires on every transcribe call; we intentionally force CPU
 #                     to avoid a torch-aarch64 CUDA deserialization bug
 # `message` is a regular expression that must match from the start of the
 # warning text, hence the leading ".*" on every pattern below.
 warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")
 warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*")
 warnings.filterwarnings("ignore", message=".*Performing inference on CPU when CUDA is available.*")
 # setdefault leaves any PYTHONWARNINGS value already in the environment alone.
 # NOTE(review): the interpreter reads PYTHONWARNINGS at startup, so this
 # presumably targets child processes rather than this one — confirm.
 os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils")
 # Absolute path of the directory that contains this script.
 PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))