# Marcus/Voice/marcus_voice.py — 351 lines, 12 KiB (Python)
#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================
State machine:
IDLE → (wake word detected) → WAKE_HEARD
WAKE_HEARD → (record command) → PROCESSING
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
SPEAKING → (TTS done) → IDLE
Wake word: "Marcus" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
Usage:
from Voice.marcus_voice import VoiceModule
voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
voice.start() # background thread
voice.stop()
"""
import logging
import os
import sys
import threading
import time
import numpy as np
# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
# Make the project root importable so `Core.*` resolves when this module is
# run directly as a script (not only as part of the package).
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
# All voice logging goes to logs/voice.log AND stderr.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
# Idempotent — only the first call per process installs handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    """String constants naming the states of the voice pipeline's state machine."""
    IDLE = "IDLE"                # listening for the wake word
    WAKE_HEARD = "WAKE_HEARD"    # wake word detected; record the command
    PROCESSING = "PROCESSING"    # transcribe the command and dispatch to brain
    SPEAKING = "SPEAKING"        # TTS playing; wait before listening again
# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus.

    State machine (driven by a background daemon thread):
        IDLE -> WAKE_HEARD -> PROCESSING -> SPEAKING -> IDLE
    Wake word detection uses a small Whisper model on short chunks; commands
    are transcribed with a larger model after silence-based endpointing.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]
        # Audio captured in WAKE_HEARD and consumed in PROCESSING.  Initialized
        # here so _do_processing() cannot raise AttributeError if the state
        # machine is ever driven out of order.
        self._command_audio = None
        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000
        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_whisper(self):
        """Lazy-load both Whisper models (wake-word + command); idempotent."""
        import whisper
        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")
        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic.

        Returns:
            int16 mono samples at self._sample_rate.
        """
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono = 2 bytes/sample
        raw = bytearray()
        read_size = 1024  # max bytes per read_chunk() call
        while len(raw) < num_bytes:
            raw.extend(self._mic_capture.read_chunk(min(read_size, num_bytes - len(raw))))
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`.

        Stops early on mic EOF, and unconditionally after `max_record_sec`.
        Returns an empty int16 array if nothing was captured.
        """
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        all_audio = []
        silence_count = 0
        chunk_count = 0
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break  # mic stream ended
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1
            # float64 avoids int16 overflow when squaring
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0
            # chunk_count > 2 guarantees at least 1s of audio before endpointing
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio with the given Whisper model.

        Args:
            audio: int16 mono samples at 16 kHz (Whisper's expected rate).
            model: a loaded Whisper model.
            task: Whisper task, "transcribe" or "translate".
        Returns:
            The stripped transcript text.
        """
        # Convert int16 to float32 [-1, 1] as Whisper expects
        audio_f32 = audio.astype(np.float32) / 32768.0
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,  # CPU inference — fp16 is GPU-only
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Return True if the transcribed text contains an English wake word."""
        text_lower = text.lower().strip()
        return any(w in text_lower for w in self._wake_en)

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE.
                    # Also bail out promptly if stop() was called mid-utterance.
                    while self._audio.is_speaking and self._running:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Log and recover — a single bad iteration must not kill the thread.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(2.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking)
        if audio.std() < 100:
            return
        text = self._transcribe(audio, self._wake_model)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for "Listening..." TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the recorded command and send it to the brain callback."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        log.info("Command: %s", text)
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                # The brain's failure must not take down the voice thread.
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a background daemon thread. Idempotent."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening and join the worker thread (5s timeout)."""
        self._running = False
        # Stop the mic first — unblocks a loop thread stuck in read_chunk().
        try:
            self._mic_capture.stop()
        except Exception:
            pass  # best-effort shutdown; mic may already be closed
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        """Current state machine state (one of the State constants)."""
        return self._state

    @property
    def is_running(self) -> bool:
        """True while the background voice thread is (supposed to be) alive."""
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    # Standalone smoke test: wire the module to a print-only callback.
    # NOTE: `sys` is already imported at module top — the redundant re-import
    # was removed.  PROJECT_ROOT is prepended so `API.*` resolves as a script.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        """Print each recognized command — stand-in for the brain callback."""
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)
    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()
    print("Done.")