diff --git a/Config/config_Voice.json b/Config/config_Voice.json index 91635fb..e25bf0c 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -7,7 +7,11 @@ "stt": { "wake_model": "tiny", "command_model": "tiny", - "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"], + "wake_words_en": [ + "sanad", "sannad", "sanat", "sunnat", + "senad", "sennad", "sanid", "sanud", "sand", + "samad", "sandy", "sanday", "sunday", "synod", "signed", "sent" + ], "language": "en", "command_timeout_sec": 10, "silence_threshold": 500, @@ -37,7 +41,7 @@ "log_file": "logs/voice.log" }, "messages": { - "wake_heard": "Listening", + "wake_heard": "Yes", "no_speech": "I didn't catch that, please say it again", "error_tts": "Speech synthesis failed", "error_mic": "Microphone error", diff --git a/Config/marcus_prompts.yaml b/Config/marcus_prompts.yaml index 738ee4a..7a71fbf 100644 --- a/Config/marcus_prompts.yaml +++ b/Config/marcus_prompts.yaml @@ -45,11 +45,14 @@ goal_prompt: | {{"reached":,"next_move":"","duration":<0.3-0.8>,"speak":"","confidence":""}} Rules: - - reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false. - - For compound goals ("person holding phone"), both parts must be visible in the SAME frame. + - reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false. + - "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations). + - "person" means a human body visible — not just a chair or bag that belongs to someone. + - If you are not sure the target type matches exactly → reached = false, keep searching. + - For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame. - confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+. - next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far). - - speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content. + - speak: write a concrete description of the objects visible in THIS frame, in your own words. # ── PATROL PROMPT ──────────────────────────────────────────────────────────── diff --git a/Data/Brain/Sessions/session_001_2026-04-22/alerts.json b/Data/Brain/Sessions/session_001_2026-04-22/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-22/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/commands.json b/Data/Brain/Sessions/session_001_2026-04-22/commands.json new file mode 100644 index 0000000..a420e30 --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-22/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "15:29:59", + "cmd": "hello, can you hear me", + "response": "Hello, can you hear me?", + "duration_s": 4.69 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/detections.json b/Data/Brain/Sessions/session_001_2026-04-22/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-22/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-22/places.json b/Data/Brain/Sessions/session_001_2026-04-22/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-22/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/alerts.json b/Data/Brain/Sessions/session_002_2026-04-22/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_002_2026-04-22/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/commands.json b/Data/Brain/Sessions/session_002_2026-04-22/commands.json new file mode 100644 index 0000000..b5aa0ad --- /dev/null +++ b/Data/Brain/Sessions/session_002_2026-04-22/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "15:37:37", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/detections.json b/Data/Brain/Sessions/session_002_2026-04-22/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_002_2026-04-22/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_002_2026-04-22/places.json b/Data/Brain/Sessions/session_002_2026-04-22/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_002_2026-04-22/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index 684a186..c59d49a 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -8,8 +8,9 @@ State machine: PROCESSING → (Whisper transcribe) → send to brain → SPEAKING SPEAKING → (TTS done) → IDLE -Wake word: "Marcus" (detected by Whisper tiny) -Commands: Transcribed by Whisper small +Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in + config_Voice.json::stt.wake_words_en) +Commands: Transcribed by Whisper tiny (small if quality suffers) Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py) TTS: English only, Unitree built-in TtsMaker (API/audio_api.py) @@ -190,27 +191,85 @@ class VoiceModule: def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str: """Transcribe audio using Whisper. Returns text.""" + import warnings import whisper + # Audio stats — log before transcribe so we can see exactly what + # Whisper is being fed. Useful when wake-word never fires: if + # peak_int16 is always < 500 the mic is too quiet regardless of + # any software gain. + peak_i16 = int(np.abs(audio).max()) if audio.size else 0 + rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0 + log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16) + # Convert int16 to float32 [-1, 1] audio_f32 = audio.astype(np.float32) / 32768.0 - # Whisper expects 16kHz - result = model.transcribe( - audio_f32, - language=self._stt["language"], # None = auto-detect - task=task, - fp16=False, - ) + # Normalize to ~0.9 peak so Whisper's mel features carry real energy. + # Harmless on already-loud audio. Skip if peak is essentially zero + # (no signal at all) — amplifying pure noise doesn't help. + peak = float(np.abs(audio_f32).max()) + if peak > 1e-4 and peak < 0.9: + audio_f32 = audio_f32 * (0.9 / peak) + log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak) + + # Suppress the per-call "Performing inference on CPU when CUDA is + # available" UserWarning. A module-level warnings.filterwarnings() + # doesn't catch it because whisper re-issues the warning every call + # via its own logger path. catch_warnings scoped to this call is + # the clean way. + # + # CRITICAL: temperature=0.0 (greedy, no fallback). + # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6, + # 0.8, 1.0) — it retries with higher temperatures when the greedy + # pass misses a quality gate. The retry path calls + # `Categorical(logits=logits / temperature).sample()` which blows + # up on Jetson's torch-aarch64 (logits overflow to inf → softmax + # becomes NaN). Traceback (2026-04-22): + # ValueError: Expected parameter logits ... found invalid values: + # tensor([[nan, nan, nan, ..., nan, nan, nan]]) + # The voice thread crashed every 2 s and wake-word never fired. + # Forcing temperature=0.0 stays on the greedy path (argmax), which + # has no Categorical sampler and no numerical instability. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + result = model.transcribe( + audio_f32, + language=self._stt["language"], # None = auto-detect + task=task, + fp16=False, + temperature=0.0, # no fallback — avoids NaN bug + condition_on_previous_text=False, # no accumulated context + ) text = result["text"].strip() detected_lang = result.get("language", "unknown") + + # Filter Whisper's "no phonetic content" degeneration patterns. + # Near-silence or very quiet speech can produce repetitive filler + # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single + # repeated word. Treat anything with < 3 distinct alphanumeric + # characters as silence so the wake-word check doesn't see it. + alnum = ''.join(c.lower() for c in text if c.isalnum()) + if not alnum or len(set(alnum)) < 3: + log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60]) + return "" + log.info("Transcribed [%s]: %s", detected_lang, text[:100]) return text def _check_wake_word(self, text: str) -> bool: - """Check if transcribed text contains an English wake word.""" + """ + Check if transcribed text contains an English wake word. + Matches on word boundary (so "sandstorm" doesn't trigger off "sand"), + but is lenient about punctuation/whitespace around the word. + """ + import re text_lower = text.lower().strip() - return any(w in text_lower for w in self._wake_en) + # word-boundary regex built once per call (cheap; runs 2×/sec) + for w in self._wake_en: + if re.search(r'\b' + re.escape(w) + r'\b', text_lower): + return True + return False # ─── MAIN LOOP ──────────────────────────────────────── diff --git a/run_marcus.py b/run_marcus.py index 70b5e98..d56dca6 100644 --- a/run_marcus.py +++ b/run_marcus.py @@ -7,12 +7,15 @@ import os import sys import warnings -# Silence known-harmless third-party deprecation warnings before ANY heavy -# import fires them. Keeps the terminal dashboard readable. +# Silence known-harmless third-party warnings before ANY heavy import fires +# them. Keeps the terminal dashboard readable. # - TypedStorage : fires from torch during yolov8m.pt checkpoint load -# - torch.ampwhile : fires in ultralytics when FP16 is enabled on Jetson torch 2.1 +# - torch.cuda.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1 +# - Whisper CPU : fires on every transcribe call; we intentionally force CPU +# to avoid a torch-aarch64 CUDA deserialization bug warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*") warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*") +warnings.filterwarnings("ignore", message=".*Performing inference on CPU when CUDA is available.*") os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils") PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))