# Marcus/Voice/marcus_voice.py  (export header: 210 lines, 7.6 KiB, Python)
#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Wake-Signal Module (no ML, no STT).
This is a deliberately-minimal voice subsystem:
- A custom energy-based wake detector (Voice/wake_detector.py) listens
to the G1's on-board mic continuously.
- When the user says any short word (~0.2-1.5 s of speech followed by
silence), wake fires.
- The robot acknowledges via TTS ("Yes" — configurable).
- The user then types their command at the Marcus terminal prompt.
No Vosk, no Whisper, no torch, no network. Pure numpy DSP.
Why not STT here:
Both Vosk's small English model ("sanad" absent from lexicon) and
openai-whisper ("!!!!!" numerical garbage on this Jetson's torch-aarch64)
proved unreliable for this hardware. Rather than fight either, the
wake path becomes a simple "did the user say something?" signal.
Interface with Marcus brain:
VoiceModule(audio_api, on_wake=callback)
on_wake() is called when wake fires. Brain can display a prompt
or do anything else.
"""
from __future__ import annotations
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional
# ─── PATH + CONFIG ───────────────────────────────────────
# Make the project root importable *before* pulling in Core helpers.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

# All voice-subsystem activity goes to logs/voice.log with size-based rotation.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

_file_handler = RotatingFileHandler(
    os.path.join(LOG_DIR, "voice.log"),
    maxBytes=5_000_000,
    backupCount=3,
    encoding="utf-8",
)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[_file_handler],
)
log = logging.getLogger("marcus_voice")
class VoiceModule:
    """Wake-only voice subsystem — fires a callback when speech is detected.

    There is no STT here: the module only answers "did the user say
    something?", acknowledges via TTS, and hands control back to the brain.
    """

    def __init__(self, audio_api, on_command: Optional[Callable] = None,
                 on_wake: Optional[Callable] = None):
        """
        Args:
            audio_api: AudioAPI instance (for TTS ack).
            on_command: kept for API compatibility; always called with
                        text="" because there's no STT. Brain should
                        prompt the user to type.
            on_wake: alternative callback fired when wake detected.
                     Exactly one of on_command / on_wake is used.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake
        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Wake-detector parameters (tweakable via config_Voice.json::stt).
        from Voice.wake_detector import WakeDetector, WakeConfig
        wake_cfg = WakeConfig(
            sample_rate=16_000,
            speech_threshold=float(self._stt.get("speech_threshold", 150.0)),
            min_word_duration_s=float(self._stt.get("min_word_duration", 0.20)),
            max_word_duration_s=float(self._stt.get("max_word_duration", 1.50)),
            post_silence_s=float(self._stt.get("post_silence", 0.30)),
            cooldown_s=float(self._stt.get("wake_cooldown", 1.50)),
            chunk_ms=int(self._stt.get("wake_chunk_ms", 50)),
        )
        self._detector = WakeDetector(wake_cfg)

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        mic_cfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=mic_cfg.get("group", "239.168.123.161"),
            port=mic_cfg.get("port", 5555),
            buf_max=mic_cfg.get("buffer_max_bytes", 64000),
        )

        self._running = False
        self._thread = None
        log.info(
            "VoiceModule initialized (custom wake detector, "
            "speech_threshold=%s, min/max_word=%s/%s s)",
            wake_cfg.speech_threshold,
            wake_cfg.min_word_duration_s,
            wake_cfg.max_word_duration_s,
        )

    # ─── main loop ────────────────────────────────────────
    def _voice_loop(self):
        """Background loop: pull mic audio and feed the wake detector."""
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake (energy-based, no ML)")
        while self._running:
            try:
                if self._audio.is_speaking:
                    # The robot's own TTS is playing: pause briefly and clear
                    # detector state so our speech can't self-trigger the wake.
                    time.sleep(0.1)
                    self._detector.reset()
                    continue
                frame = self._mic_capture.read_chunk(1024)  # ~32 ms at 16 kHz
                if frame and self._detector.process(frame):
                    self._on_wake_fired()
            except Exception as exc:
                # Top-of-thread boundary: log and keep the loop alive.
                log.error("Voice loop error: %s", exc, exc_info=True)
                time.sleep(1)

    def _on_wake_fired(self):
        """Acknowledge a wake acoustically and notify the brain callback."""
        log.info("Wake detected (acoustic)")
        print("\n [Sanad] wake heard — type your command at the prompt.")
        # TTS ack — best-effort; a failed ack must not swallow the wake.
        ack_text = self._messages.get("wake_heard", "Yes")
        try:
            self._audio.speak(ack_text)
        except Exception as exc:
            log.warning("TTS ack failed: %s", exc)
        # Brain callbacks for compatibility with the old interface.
        if self._on_wake:
            try:
                self._on_wake()
            except Exception as exc:
                log.error("on_wake callback error: %s", exc)
            return
        if self._on_command:
            # Old API expected (text, lang). We have no transcription, so
            # pass empty text — brain is expected to prompt for typed input.
            try:
                self._on_command("", "en")
            except Exception as exc:
                log.error("on_command callback error: %s", exc)

    # ─── start / stop ─────────────────────────────────────
    def start(self):
        """Launch the background listening thread (no-op if already running)."""
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        worker = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread = worker
        worker.start()
        log.info("Voice module started")

    def stop(self):
        """Stop the mic capture and join the listening thread."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass  # best-effort: mic may never have been started
        worker, self._thread = self._thread, None
        if worker:
            worker.join(timeout=5)
        log.info("Voice module stopped")

    @property
    def is_running(self) -> bool:
        """True while the voice thread is (or should be) alive."""
        return self._running
# ─── standalone test ─────────────────────────────────────
if __name__ == "__main__":
    from API.audio_api import AudioAPI

    def on_wake():
        print(" (brain callback fired)")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_wake=on_wake)
    print("Starting voice module... say any short word to test the wake.")
    print("Press Ctrl-C to stop.\n")
    voice.start()
    try:
        # Spin until interrupted; the wake loop runs in its own thread.
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()
    print("Done.")