Update 2026-04-22 17:01:46

This commit is contained in:
kassam 2026-04-22 17:01:48 +04:00
parent 78a5b0b408
commit 00e52496a9
12 changed files with 110 additions and 19 deletions

View File

@ -7,7 +7,11 @@
"stt": {
"wake_model": "tiny",
"command_model": "tiny",
"wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
"wake_words_en": [
"sanad", "sannad", "sanat", "sunnat",
"senad", "sennad", "sanid", "sanud", "sand",
"samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
],
"language": "en",
"command_timeout_sec": 10,
"silence_threshold": 500,
@ -37,7 +41,7 @@
"log_file": "logs/voice.log"
},
"messages": {
"wake_heard": "Listening",
"wake_heard": "Yes",
"no_speech": "I didn't catch that, please say it again",
"error_tts": "Speech synthesis failed",
"error_mic": "Microphone error",

View File

@ -45,11 +45,14 @@ goal_prompt: |
{{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
Rules:
- reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false.
- For compound goals ("person holding phone"), both parts must be visible in the SAME frame.
- reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false.
- "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations).
- "person" means a human body visible — not just a chair or bag that belongs to someone.
- If you are not sure the target type matches exactly → reached = false, keep searching.
- For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame.
- confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
- next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
- speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content.
- speak: write a concrete description of the objects visible in THIS frame, in your own words.
# ── PATROL PROMPT ────────────────────────────────────────────────────────────

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "15:29:59",
"cmd": "hello, can you hear me",
"response": "Hello, can you hear me?",
"duration_s": 4.69
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "15:37:37",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -8,8 +8,9 @@ State machine:
PROCESSING --(Whisper transcribe, send to brain)--> SPEAKING
SPEAKING --(TTS done)--> IDLE
Wake word: "Marcus" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
config_Voice.json::stt.wake_words_en)
Commands: Transcribed by Whisper tiny (small if quality suffers)
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
@ -190,27 +191,85 @@ class VoiceModule:
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
    """Transcribe int16 PCM audio with a Whisper model and return the text.

    Args:
        audio: 1-D int16 numpy array (16 kHz mono — TODO confirm against the
            mic capture path; Whisper expects 16 kHz input).
        model: a loaded Whisper model exposing ``.transcribe(...)``.
        task:  Whisper task, ``"transcribe"`` (default) or ``"translate"``.

    Returns:
        The stripped transcript, or ``""`` when the audio is judged to be
        silence / degenerate filler.
    """
    import warnings

    # Audio stats — logged before transcription so we can see exactly what
    # Whisper is being fed. Useful when the wake word never fires: if
    # peak_int16 stays < 500 the mic is too quiet regardless of any
    # software gain.
    peak_i16 = int(np.abs(audio).max()) if audio.size else 0
    rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
    log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)

    # Convert int16 to float32 in [-1, 1], the range Whisper consumes.
    audio_f32 = audio.astype(np.float32) / 32768.0

    # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
    # Harmless on already-loud audio. Skipped when the peak is essentially
    # zero (no signal at all) — amplifying pure noise doesn't help.
    # (Guarded against empty input: .max() on a zero-size array raises.)
    peak = float(np.abs(audio_f32).max()) if audio_f32.size else 0.0
    if 1e-4 < peak < 0.9:
        audio_f32 = audio_f32 * (0.9 / peak)
        log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)

    # Suppress the per-call "Performing inference on CPU when CUDA is
    # available" UserWarning. A module-level warnings.filterwarnings()
    # doesn't catch it because whisper re-issues the warning every call
    # via its own logger path; catch_warnings scoped to this call is the
    # clean way.
    #
    # CRITICAL: temperature=0.0 (greedy, no fallback).
    # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6, 0.8,
    # 1.0) — it retries with higher temperatures when the greedy pass
    # misses a quality gate. The retry path calls
    # `Categorical(logits=logits / temperature).sample()`, which blows up
    # on Jetson's torch-aarch64 (logits overflow to inf → softmax becomes
    # NaN):
    #   ValueError: Expected parameter logits ... found invalid values:
    #   tensor([[nan, nan, nan, ..., nan, nan, nan]])
    # The voice thread crashed every 2 s and wake-word never fired.
    # Forcing temperature=0.0 stays on the greedy path (argmax), which has
    # no Categorical sampler and no numerical instability.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],    # None = auto-detect
            task=task,
            fp16=False,
            temperature=0.0,                   # no fallback — avoids NaN bug
            condition_on_previous_text=False,  # no accumulated context
        )
    text = result["text"].strip()
    detected_lang = result.get("language", "unknown")

    # Filter Whisper's "no phonetic content" degeneration patterns.
    # Near-silence or very quiet speech can produce repetitive filler like
    # "!!!!!!!!!", ". . . . .", "... ... ...", or a single repeated word.
    # Treat anything with < 3 distinct alphanumeric characters as silence
    # so the wake-word check doesn't see it.
    alnum = ''.join(c.lower() for c in text if c.isalnum())
    if not alnum or len(set(alnum)) < 3:
        log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
        return ""

    log.info("Transcribed [%s]: %s", detected_lang, text[:100])
    return text
def _check_wake_word(self, text: str) -> bool:
"""Check if transcribed text contains an English wake word."""
"""
Check if transcribed text contains an English wake word.
Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
but is lenient about punctuation/whitespace around the word.
"""
import re
text_lower = text.lower().strip()
return any(w in text_lower for w in self._wake_en)
# word-boundary regex built once per call (cheap; runs 2×/sec)
for w in self._wake_en:
if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
return True
return False
# ─── MAIN LOOP ────────────────────────────────────────

View File

@ -7,12 +7,15 @@ import os
import sys
import warnings
# Silence known-harmless third-party deprecation warnings before ANY heavy
# import fires them. Keeps the terminal dashboard readable.
# Silence known-harmless third-party warnings before ANY heavy import fires
# them. Keeps the terminal dashboard readable.
# - TypedStorage : fires from torch during yolov8m.pt checkpoint load
# - torch.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
# - torch.cuda.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
# - Whisper CPU : fires on every transcribe call; we intentionally force CPU
# to avoid a torch-aarch64 CUDA deserialization bug
warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")
warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*")
warnings.filterwarnings("ignore", message=".*Performing inference on CPU when CUDA is available.*")
os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils")
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))