Update 2026-04-22 17:01:46
This commit is contained in:
parent
78a5b0b408
commit
00e52496a9
@ -7,7 +7,11 @@
|
|||||||
"stt": {
|
"stt": {
|
||||||
"wake_model": "tiny",
|
"wake_model": "tiny",
|
||||||
"command_model": "tiny",
|
"command_model": "tiny",
|
||||||
"wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
|
"wake_words_en": [
|
||||||
|
"sanad", "sannad", "sanat", "sunnat",
|
||||||
|
"senad", "sennad", "sanid", "sanud", "sand",
|
||||||
|
"samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
|
||||||
|
],
|
||||||
"language": "en",
|
"language": "en",
|
||||||
"command_timeout_sec": 10,
|
"command_timeout_sec": 10,
|
||||||
"silence_threshold": 500,
|
"silence_threshold": 500,
|
||||||
@ -37,7 +41,7 @@
|
|||||||
"log_file": "logs/voice.log"
|
"log_file": "logs/voice.log"
|
||||||
},
|
},
|
||||||
"messages": {
|
"messages": {
|
||||||
"wake_heard": "Listening",
|
"wake_heard": "Yes",
|
||||||
"no_speech": "I didn't catch that, please say it again",
|
"no_speech": "I didn't catch that, please say it again",
|
||||||
"error_tts": "Speech synthesis failed",
|
"error_tts": "Speech synthesis failed",
|
||||||
"error_mic": "Microphone error",
|
"error_mic": "Microphone error",
|
||||||
|
|||||||
@ -45,11 +45,14 @@ goal_prompt: |
|
|||||||
{{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
|
{{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false.
|
- reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false.
|
||||||
- For compound goals ("person holding phone"), both parts must be visible in the SAME frame.
|
- "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations).
|
||||||
|
- "person" means a human body visible — not just a chair or bag that belongs to someone.
|
||||||
|
- If you are not sure the target type matches exactly → reached = false, keep searching.
|
||||||
|
- For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame.
|
||||||
- confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
|
- confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
|
||||||
- next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
|
- next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
|
||||||
- speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content.
|
- speak: write a concrete description of the objects visible in THIS frame, in your own words.
|
||||||
|
|
||||||
|
|
||||||
# ── PATROL PROMPT ────────────────────────────────────────────────────────────
|
# ── PATROL PROMPT ────────────────────────────────────────────────────────────
|
||||||
|
|||||||
1
Data/Brain/Sessions/session_001_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_001_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
8
Data/Brain/Sessions/session_001_2026-04-22/commands.json
Normal file
8
Data/Brain/Sessions/session_001_2026-04-22/commands.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"time": "15:29:59",
|
||||||
|
"cmd": "hello, can you hear me",
|
||||||
|
"response": "Hello, can you hear me?",
|
||||||
|
"duration_s": 4.69
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_001_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_001_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
1
Data/Brain/Sessions/session_002_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_002_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
8
Data/Brain/Sessions/session_002_2026-04-22/commands.json
Normal file
8
Data/Brain/Sessions/session_002_2026-04-22/commands.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"time": "15:37:37",
|
||||||
|
"cmd": "turn left",
|
||||||
|
"response": "local command",
|
||||||
|
"duration_s": 0.0
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_002_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_002_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
@ -8,8 +8,9 @@ State machine:
|
|||||||
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
|
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
|
||||||
SPEAKING → (TTS done) → IDLE
|
SPEAKING → (TTS done) → IDLE
|
||||||
|
|
||||||
Wake word: "Marcus" (detected by Whisper tiny)
|
Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
|
||||||
Commands: Transcribed by Whisper small
|
config_Voice.json::stt.wake_words_en)
|
||||||
|
Commands: Transcribed by Whisper tiny (small if quality suffers)
|
||||||
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
|
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
|
||||||
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
|
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
|
||||||
|
|
||||||
@ -190,27 +191,85 @@ class VoiceModule:
|
|||||||
|
|
||||||
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
|
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
|
||||||
"""Transcribe audio using Whisper. Returns text."""
|
"""Transcribe audio using Whisper. Returns text."""
|
||||||
|
import warnings
|
||||||
import whisper
|
import whisper
|
||||||
|
|
||||||
|
# Audio stats — log before transcribe so we can see exactly what
|
||||||
|
# Whisper is being fed. Useful when wake-word never fires: if
|
||||||
|
# peak_int16 is always < 500 the mic is too quiet regardless of
|
||||||
|
# any software gain.
|
||||||
|
peak_i16 = int(np.abs(audio).max()) if audio.size else 0
|
||||||
|
rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
|
||||||
|
log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
|
||||||
|
|
||||||
# Convert int16 to float32 [-1, 1]
|
# Convert int16 to float32 [-1, 1]
|
||||||
audio_f32 = audio.astype(np.float32) / 32768.0
|
audio_f32 = audio.astype(np.float32) / 32768.0
|
||||||
|
|
||||||
# Whisper expects 16kHz
|
# Normalize to ~0.9 peak so Whisper's mel features carry real energy.
|
||||||
result = model.transcribe(
|
# Harmless on already-loud audio. Skip if peak is essentially zero
|
||||||
audio_f32,
|
# (no signal at all) — amplifying pure noise doesn't help.
|
||||||
language=self._stt["language"], # None = auto-detect
|
peak = float(np.abs(audio_f32).max())
|
||||||
task=task,
|
if peak > 1e-4 and peak < 0.9:
|
||||||
fp16=False,
|
audio_f32 = audio_f32 * (0.9 / peak)
|
||||||
)
|
log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
|
||||||
|
|
||||||
|
# Suppress the per-call "Performing inference on CPU when CUDA is
|
||||||
|
# available" UserWarning. A module-level warnings.filterwarnings()
|
||||||
|
# doesn't catch it because whisper re-issues the warning every call
|
||||||
|
# via its own logger path. catch_warnings scoped to this call is
|
||||||
|
# the clean way.
|
||||||
|
#
|
||||||
|
# CRITICAL: temperature=0.0 (greedy, no fallback).
|
||||||
|
# Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
|
||||||
|
# 0.8, 1.0) — it retries with higher temperatures when the greedy
|
||||||
|
# pass misses a quality gate. The retry path calls
|
||||||
|
# `Categorical(logits=logits / temperature).sample()` which blows
|
||||||
|
# up on Jetson's torch-aarch64 (logits overflow to inf → softmax
|
||||||
|
# becomes NaN). Traceback (2026-04-22):
|
||||||
|
# ValueError: Expected parameter logits ... found invalid values:
|
||||||
|
# tensor([[nan, nan, nan, ..., nan, nan, nan]])
|
||||||
|
# The voice thread crashed every 2 s and wake-word never fired.
|
||||||
|
# Forcing temperature=0.0 stays on the greedy path (argmax), which
|
||||||
|
# has no Categorical sampler and no numerical instability.
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.simplefilter("ignore")
|
||||||
|
result = model.transcribe(
|
||||||
|
audio_f32,
|
||||||
|
language=self._stt["language"], # None = auto-detect
|
||||||
|
task=task,
|
||||||
|
fp16=False,
|
||||||
|
temperature=0.0, # no fallback — avoids NaN bug
|
||||||
|
condition_on_previous_text=False, # no accumulated context
|
||||||
|
)
|
||||||
text = result["text"].strip()
|
text = result["text"].strip()
|
||||||
detected_lang = result.get("language", "unknown")
|
detected_lang = result.get("language", "unknown")
|
||||||
|
|
||||||
|
# Filter Whisper's "no phonetic content" degeneration patterns.
|
||||||
|
# Near-silence or very quiet speech can produce repetitive filler
|
||||||
|
# like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
|
||||||
|
# repeated short token ("oh oh oh"). Treat anything with < 3 distinct alphanumeric
|
||||||
|
# characters as silence so the wake-word check doesn't see it.
|
||||||
|
alnum = ''.join(c.lower() for c in text if c.isalnum())
|
||||||
|
if not alnum or len(set(alnum)) < 3:
|
||||||
|
log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
|
||||||
|
return ""
|
||||||
|
|
||||||
log.info("Transcribed [%s]: %s", detected_lang, text[:100])
|
log.info("Transcribed [%s]: %s", detected_lang, text[:100])
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _check_wake_word(self, text: str) -> bool:
|
def _check_wake_word(self, text: str) -> bool:
|
||||||
"""Check if transcribed text contains an English wake word."""
|
"""
|
||||||
|
Check if transcribed text contains an English wake word.
|
||||||
|
Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
|
||||||
|
but is lenient about punctuation/whitespace around the word.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
text_lower = text.lower().strip()
|
text_lower = text.lower().strip()
|
||||||
return any(w in text_lower for w in self._wake_en)
|
# one word-boundary regex search per wake word on every call (cheap; runs 2×/sec)
|
||||||
|
for w in self._wake_en:
|
||||||
|
if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
# ─── MAIN LOOP ────────────────────────────────────────
|
# ─── MAIN LOOP ────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
@ -7,12 +7,15 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
# Silence known-harmless third-party deprecation warnings before ANY heavy
|
# Silence known-harmless third-party warnings before ANY heavy import fires
|
||||||
# import fires them. Keeps the terminal dashboard readable.
|
# them. Keeps the terminal dashboard readable.
|
||||||
# - TypedStorage : fires from torch during yolov8m.pt checkpoint load
|
# - TypedStorage : fires from torch during yolov8m.pt checkpoint load
|
||||||
# - torch.ampwhile : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
|
# - torch.cuda.amp : fires in ultralytics when FP16 is enabled on Jetson torch 2.1
|
||||||
|
# - Whisper CPU : fires on every transcribe call; we intentionally force CPU
|
||||||
|
# to avoid a torch-aarch64 CUDA deserialization bug
|
||||||
warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")
|
warnings.filterwarnings("ignore", message=".*TypedStorage is deprecated.*")
|
||||||
warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*")
|
warnings.filterwarnings("ignore", message=".*torch\\.cuda\\.amp.*")
|
||||||
|
warnings.filterwarnings("ignore", message=".*Performing inference on CPU when CUDA is available.*")
|
||||||
os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils")
|
os.environ.setdefault("PYTHONWARNINGS", "ignore::UserWarning:torch._utils")
|
||||||
|
|
||||||
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
|
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user