Marcus/Voice/marcus_voice.py

433 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================
State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE
Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
config_Voice.json::stt.wake_words_en)
Commands: Transcribed by Whisper tiny (small if quality suffers)
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
Usage:
    from Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()  # background thread
    voice.stop()
"""
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
# NOTE: the sys.path patch MUST run before the Core imports below resolve.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
# Create the log directory before the RotatingFileHandler tries to open
# its file inside it.
os.makedirs(LOG_DIR, exist_ok=True)
# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
# logs/voice.log so they don't drown out the interactive `Command:` prompt.
# Anything the user needs to see (wake-word fired, command heard) is
# print()-ed explicitly from the callbacks below.
# basicConfig is idempotent; audio_api may have already called it.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
# Module-level logger used by everything in this file.
log = logging.getLogger("marcus_voice")
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    """Names for the four stages of the voice pipeline.

    Plain string constants (not an Enum): `VoiceModule.state` returns them
    directly, so they compare and log as ordinary strings.
    """
    IDLE = "IDLE"                # listening for the wake word
    WAKE_HEARD = "WAKE_HEARD"    # wake word fired; recording a command
    PROCESSING = "PROCESSING"    # transcribing + dispatching the command
    SPEAKING = "SPEAKING"        # waiting for TTS playback to finish
# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus.

    Drives the state machine IDLE → WAKE_HEARD → PROCESSING → SPEAKING in a
    background daemon thread (`_voice_loop`). Audio comes from the G1
    built-in mic over UDP multicast; transcription is Whisper (CPU-only);
    recognized commands are forwarded to the `on_command` callback.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]
        # G1 built-in mic (UDP multicast). Local import matches original
        # placement — the mic stack is only needed when a module is built.
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000
        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()  # currently unused; reserved for state changes
        # FIX: initialize so _do_processing() can never hit AttributeError
        # if PROCESSING is ever entered without a recorded command.
        self._command_audio = None
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_whisper(self):
        """
        Lazy-load Whisper models on CPU.
        Force device='cpu' regardless of torch.cuda.is_available(). On the
        Jetson the torch install sometimes claims CUDA but can't deserialize
        to it (aarch64 wheel mismatch), and Whisper's default device-auto
        then crashes with:
            _pickle.UnpicklingError: Weights only load failed.
            Attempting to deserialize object on CUDA device 0
        CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
        """
        import whisper
        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
            log.info("Wake model ready")
        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
            log.info("Command model ready")

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic.

        Returns int16 mono samples; may be shorter than requested if the
        mic stops delivering data.
        """
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        bite = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(bite, num_bytes - len(raw)))
            if not data:
                # FIX: empty read means the mic stopped — bail out instead
                # of spinning forever (matches _record_until_silence).
                break
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`.

        Caps total capture at `max_record_sec`. Requires at least 3 chunks
        (1.5 s) before silence can end the recording, so a brief pause right
        after the wake word doesn't cut the command short.
        """
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        all_audio = []
        silence_count = 0
        chunk_count = 0
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio using Whisper. Returns text ("" for noise)."""
        import warnings
        import whisper
        # Audio stats — log before transcribe so we can see exactly what
        # Whisper is being fed. Useful when wake-word never fires: if
        # peak_int16 is always < 500 the mic is too quiet regardless of
        # any software gain.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
        # Convert int16 to float32 [-1, 1]
        audio_f32 = audio.astype(np.float32) / 32768.0
        # Normalize to ~0.9 peak so Whisper's mel features carry real energy.
        # Harmless on already-loud audio. Skip if peak is essentially zero
        # (no signal at all) — amplifying pure noise doesn't help.
        peak = float(np.abs(audio_f32).max())
        if peak > 1e-4 and peak < 0.9:
            audio_f32 = audio_f32 * (0.9 / peak)
            log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
        # Suppress the per-call "Performing inference on CPU when CUDA is
        # available" UserWarning. A module-level warnings.filterwarnings()
        # doesn't catch it because whisper re-issues the warning every call
        # via its own logger path. catch_warnings scoped to this call is
        # the clean way.
        #
        # CRITICAL: temperature=0.0 (greedy, no fallback).
        # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
        # 0.8, 1.0) — it retries with higher temperatures when the greedy
        # pass misses a quality gate. The retry path calls
        # `Categorical(logits=logits / temperature).sample()` which blows
        # up on Jetson's torch-aarch64 (logits overflow to inf → softmax
        # becomes NaN). Traceback (2026-04-22):
        #     ValueError: Expected parameter logits ... found invalid values:
        #     tensor([[nan, nan, nan, ..., nan, nan, nan]])
        # The voice thread crashed every 2 s and wake-word never fired.
        # Forcing temperature=0.0 stays on the greedy path (argmax), which
        # has no Categorical sampler and no numerical instability.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = model.transcribe(
                audio_f32,
                language=self._stt["language"],  # None = auto-detect
                task=task,
                fp16=False,
                temperature=0.0,  # no fallback — avoids NaN bug
                condition_on_previous_text=False,  # no accumulated context
            )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        # Filter Whisper's "no phonetic content" degeneration patterns.
        # Near-silence or very quiet speech can produce repetitive filler
        # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
        # repeated word. Treat anything with < 3 distinct alphanumeric
        # characters as silence so the wake-word check doesn't see it.
        alnum = ''.join(c.lower() for c in text if c.isalnum())
        if not alnum or len(set(alnum)) < 3:
            log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
            return ""
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
        but is lenient about punctuation/whitespace around the word.
        """
        import re
        text_lower = text.lower().strip()
        # word-boundary regex built once per call (cheap; runs 2×/sec)
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Keep the loop alive: log, reset to IDLE, back off briefly.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(2.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking)
        if audio.std() < 100:
            return
        text = self._transcribe(audio, self._wake_model)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            # One clean line to the terminal so the operator knows voice
            # actually heard them, even though all other voice logs are
            # file-only. \n leads because we may be painting over a
            # half-drawn `Command:` prompt.
            print("\n [Sanad] wake heard — recording command…")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for "Listening..." TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send to brain."""
        audio = self._command_audio
        self._command_audio = None
        if audio is None:
            # FIX: defensive guard — PROCESSING entered without a recording
            # (shouldn't happen in the normal state flow).
            log.warning("PROCESSING entered with no recorded audio")
            self._state = State.IDLE
            return
        text = self._transcribe(audio, self._cmd_model)
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        log.info("Command: %s", text)
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in background thread."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening."""
        self._running = False
        # Stop the mic first so any blocked read_chunk() in the voice
        # thread unblocks and the loop can observe _running == False.
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        # Current State.* string (read-only).
        return self._state

    @property
    def is_running(self) -> bool:
        # True between start() and stop().
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    # Standalone smoke test: wire VoiceModule to the real AudioAPI and
    # echo every recognized command to the terminal until Ctrl+C.
    import sys
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _echo_command(text, lang):
        # Print the command inside a visible banner.
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    _voice = VoiceModule(AudioAPI(), on_command=_echo_command)
    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")
    _voice.start()
    try:
        while _voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        _voice.stop()
    print("Done.")