#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================

State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Marcus" (detected by Whisper tiny)
Commands:  Transcribed by Whisper small
Mic:       G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS:       English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Voice.marcus_voice import VoiceModule

    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()   # background thread
    voice.stop()
"""
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
# The project root must be on sys.path BEFORE the Core imports below.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# basicConfig is idempotent. Whichever of audio_api / marcus_voice imports
# first installs the rotating handler; the other no-ops. Both loggers then
# share the same file handle with stdlib's per-handler thread lock.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000,
            backupCount=3,
            encoding="utf-8",
        ),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")
# ─── STATE ENUM ──────────────────────────────────────────

class State:
    """Voice-pipeline states, as plain string constants.

    Strings (rather than an Enum) keep log lines and comparisons trivial.
    """

    IDLE = "IDLE"                # waiting for the wake word
    WAKE_HEARD = "WAKE_HEARD"    # wake word detected, recording command
    PROCESSING = "PROCESSING"    # transcribing + dispatching the command
    SPEAKING = "SPEAKING"        # waiting for TTS playback to finish
# ─── VOICE MODULE ────────────────────────────────────────

class VoiceModule:
    """Always-listening voice interface for Marcus.

    Drives the IDLE → WAKE_HEARD → PROCESSING → SPEAKING state machine in a
    background daemon thread. Audio comes from the G1 built-in mic over UDP
    multicast; transcription uses two lazily-loaded Whisper models (a small
    one for the wake word, a larger one for commands); speech output goes
    through the supplied AudioAPI.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")

        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None

        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000

        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        # Set by _do_wake_heard, consumed (and cleared) by _do_processing.
        self._command_audio = None

        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────

    def _load_whisper(self):
        """Lazy-load the wake-word and command Whisper models (idempotent)."""
        import whisper

        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")

        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic.

        Returns int16 mono samples. May return fewer samples than requested
        if the mic yields an empty read (e.g. capture stopped) — without the
        empty-read break this loop would spin forever.
        """
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        step = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(step, num_bytes - len(raw)))
            if not data:
                # Mic returned nothing — bail out instead of busy-looping.
                # (Matches _record_until_silence's handling of empty reads.)
                break
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`.

        Hard-capped at `max_record_sec`. Returns int16 mono samples
        (possibly empty if the mic yields nothing at all).
        """
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)

        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        all_audio = []
        silence_count = 0
        chunk_count = 0

        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1

            # RMS in float64 — squaring int16 directly would overflow.
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0

            # chunk_count > 2 so an initial pause can't end the recording.
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break

        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────

    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio with the given Whisper model. Returns text."""
        # Convert int16 to float32 [-1, 1] — Whisper's expected input range.
        audio_f32 = audio.astype(np.float32) / 32768.0

        # Whisper expects 16kHz mono float32.
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Return True if the transcribed text contains an English wake word."""
        text_lower = text.lower().strip()
        return any(w in text_lower for w in self._wake_en)

    # ─── MAIN LOOP ────────────────────────────────────────

    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")

        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Keep the loop alive on per-iteration failure; reset to IDLE
                # and back off briefly so a hard fault can't spin-log.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for a wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

        audio = self._record_chunk(2.0)

        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return

        # Skip if nothing captured, or too quiet (no one talking).
        # The size guard also avoids a NaN std() on an empty capture.
        if audio.size == 0 or audio.std() < 100:
            return

        text = self._transcribe(audio, self._wake_model)

        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD

            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence, then hand off to PROCESSING."""
        # Wait for "Listening..." TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)

        log.info("Recording command...")
        audio = self._record_until_silence()

        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the recorded command and send it to the brain."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer immediately

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        log.info("Command: %s", text)

        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                # Brain errors must not kill the voice loop.
                log.error("Brain callback error: %s", e)

        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        """Start voice listening in a background daemon thread (idempotent)."""
        if self._running:
            log.warning("Voice module already running")
            return

        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening and join the worker thread."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass  # best-effort: mic may never have been started
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        # Current state-machine state (one of the State constants).
        return self._state

    @property
    def is_running(self) -> bool:
        # True while the background voice loop has been started and not stopped.
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    # `sys` is already imported at module top — no need to re-import it here.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        """Demo callback: print each recognized command."""
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)

    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")

    voice.start()

    try:
        # Idle-wait in the main thread; the voice loop runs in its daemon thread.
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()
    print("Done.")