#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================
State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
config_Voice.json::stt.wake_words_en)
Commands: Transcribed by Whisper tiny (small if quality suffers)
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()   # background thread
    voice.stop()
"""

import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
# logs/voice.log so they don't drown out the interactive `Command:` prompt.
# Anything the user needs to see (wake-word fired, command heard) is
# print()-ed explicitly from the callbacks below.
#
# FIX: the previous logging.basicConfig(...) call here was a silent no-op
# whenever the root logger already had handlers (the old comment itself
# noted audio_api may have called basicConfig first) — so the rotating
# file handler never attached and voice records leaked to whatever handler
# the root had, the opposite of the intent above. Configure the named
# logger directly instead: the file handler is guaranteed to attach, and
# propagate=False keeps voice records off the root/console handlers.
log = logging.getLogger("marcus_voice")
log.setLevel(logging.INFO)
if not log.handlers:  # idempotent across repeated imports
    _fh = RotatingFileHandler(
        os.path.join(LOG_DIR, "voice.log"),
        maxBytes=5_000_000, backupCount=3, encoding="utf-8",
    )
    _fh.setFormatter(logging.Formatter(
        "%(asctime)s [%(name)s] %(levelname)s: %(message)s"))
    log.addHandler(_fh)
log.propagate = False
# ─── STATE ENUM ──────────────────────────────────────────

class State:
    """Voice-pipeline states.

    Plain string constants (not enum.Enum) so the values compare and log
    as readable strings.
    """

    IDLE = "IDLE"              # waiting for the wake word
    WAKE_HEARD = "WAKE_HEARD"  # wake word fired; record the command next
    PROCESSING = "PROCESSING"  # transcribing + dispatching to the brain
    SPEAKING = "SPEAKING"      # waiting for TTS playback to finish
# ─── VOICE MODULE ────────────────────────────────────────

class VoiceModule:
    """Always-listening voice interface for Marcus.

    A background daemon thread drives the state machine:
        IDLE → WAKE_HEARD → PROCESSING → SPEAKING → IDLE
    Wake-word detection and command transcription both use Whisper (CPU);
    audio comes from the G1 built-in mic over UDP multicast.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")

        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None

        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000

        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        # FIX: previously only assigned in _do_wake_heard(), so a state
        # reset (the loop's exception handler forces IDLE) could leave
        # _do_processing() reading a missing attribute. Initialize here.
        self._command_audio = None

        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────

    def _load_whisper(self):
        """
        Lazy-load Whisper models on CPU.

        Force device='cpu' regardless of torch.cuda.is_available(). On the
        Jetson the torch install sometimes claims CUDA but can't deserialize
        to it (aarch64 wheel mismatch), and Whisper's default device-auto
        then crashes with:
            _pickle.UnpicklingError: Weights only load failed.
            Attempting to deserialize object on CUDA device 0
        CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
        """
        import whisper

        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
            log.info("Wake model ready")

        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
            log.info("Command model ready")

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic.

        Returns int16 mono samples; may be shorter than requested if the
        mic stream produces an empty read mid-capture.
        """
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        bite = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(bite, num_bytes - len(raw)))
            if not data:
                # FIX: an empty read (mic stopped / stream dropped) used to
                # spin this loop forever. Bail out with what we have, same
                # as _record_until_silence() does.
                break
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`.

        Hard-capped at `max_record_sec`. Returns int16 mono samples
        (possibly empty if the mic yields nothing).
        """
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)

        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        all_audio = []
        silence_count = 0
        chunk_count = 0

        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1

            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0

            # chunk_count > 2 keeps us listening for at least ~1.5 s so a
            # slow-to-start speaker isn't cut off immediately.
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break

        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────

    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe audio using Whisper. Returns text ("" if noise/silence)."""
        import warnings
        import whisper

        # Audio stats — log before transcribe so we can see exactly what
        # Whisper is being fed. Useful when wake-word never fires: if
        # peak_int16 is always < 500 the mic is too quiet regardless of
        # any software gain.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)

        # Convert int16 to float32 [-1, 1]
        audio_f32 = audio.astype(np.float32) / 32768.0

        # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
        # Harmless on already-loud audio. Skip if peak is essentially zero
        # (no signal at all) — amplifying pure noise doesn't help.
        peak = float(np.abs(audio_f32).max()) if audio_f32.size else 0.0
        if peak > 1e-4 and peak < 0.9:
            audio_f32 = audio_f32 * (0.9 / peak)
            log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)

        # Suppress the per-call "Performing inference on CPU when CUDA is
        # available" UserWarning. A module-level warnings.filterwarnings()
        # doesn't catch it because whisper re-issues the warning every call
        # via its own logger path. catch_warnings scoped to this call is
        # the clean way.
        #
        # CRITICAL: temperature=0.0 (greedy, no fallback).
        # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
        # 0.8, 1.0) — it retries with higher temperatures when the greedy
        # pass misses a quality gate. The retry path calls
        # `Categorical(logits=logits / temperature).sample()` which blows
        # up on Jetson's torch-aarch64 (logits overflow to inf → softmax
        # becomes NaN). Traceback (2026-04-22):
        #     ValueError: Expected parameter logits ... found invalid values:
        #     tensor([[nan, nan, nan, ..., nan, nan, nan]])
        # The voice thread crashed every 2 s and wake-word never fired.
        # Forcing temperature=0.0 stays on the greedy path (argmax), which
        # has no Categorical sampler and no numerical instability.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = model.transcribe(
                audio_f32,
                language=self._stt["language"],  # None = auto-detect
                task=task,
                fp16=False,
                temperature=0.0,  # no fallback — avoids NaN bug
                condition_on_previous_text=False,  # no accumulated context
            )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")

        # Filter Whisper's "no phonetic content" degeneration patterns.
        # Near-silence or very quiet speech can produce repetitive filler
        # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
        # repeated word. Treat anything with < 3 distinct alphanumeric
        # characters as silence so the wake-word check doesn't see it.
        alnum = ''.join(c.lower() for c in text if c.isalnum())
        if not alnum or len(set(alnum)) < 3:
            log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
            return ""

        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
        but is lenient about punctuation/whitespace around the word.
        """
        import re
        text_lower = text.lower().strip()
        # word-boundary regex built once per call (cheap; runs 2×/sec)
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────

    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")

        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Catch-all boundary: log with traceback, reset the state
                # machine, and back off briefly so a hard failure can't
                # spin the thread at full speed.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

        audio = self._record_chunk(2.0)

        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return

        # FIX: guard the empty case — np.std([]) is NaN with a
        # RuntimeWarning. Then skip if too quiet (no one talking).
        if audio.size == 0 or audio.std() < 100:
            return

        text = self._transcribe(audio, self._wake_model)

        if self._check_wake_word(text):
            log.info("Wake word detected!")
            # One clean line to the terminal so the operator knows voice
            # actually heard them, even though all other voice logs are
            # file-only. \n leads because we may be painting over a
            # half-drawn `Command:` prompt.
            print("\n [Sanad] wake heard — recording command…")
            self._state = State.WAKE_HEARD

            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for "Listening..." TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)

        log.info("Recording command...")
        audio = self._record_until_silence()

        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send to brain."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer promptly

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        log.info("Command: %s", text)

        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)

        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        """Start voice listening in background thread (no-op if running)."""
        if self._running:
            log.warning("Voice module already running")
            return

        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening and join the worker thread (5 s timeout)."""
        self._running = False
        # Stop the mic first: best-effort, and it can unblock a worker
        # stuck waiting on read_chunk so the join below succeeds.
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        """Current state-machine state (one of the State constants)."""
        return self._state

    @property
    def is_running(self) -> bool:
        """True while the background voice thread should keep running."""
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    import sys

    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        """Echo each recognized command inside a visual banner."""
        banner = '=' * 50
        print(f"\n{banner}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{banner}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)

    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")

    voice.start()

    # Idle in the foreground until the module stops or the user interrupts.
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()
    print("Done.")