#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================

State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Vosk transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Sanad" (open-vocabulary Vosk transcription, fuzzy-matched
    against the mistranscription variants in config_Voice.json::stt.wake_words_en)
Commands:  transcribed by Vosk (small English model; see _load_stt)
Mic:       G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS:       English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Voice.marcus_voice import VoiceModule

    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()   # background thread
    voice.stop()
"""
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Optional
import numpy as np
# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
# logs/voice.log so they don't drown out the interactive `Command:` prompt.
# Anything the user needs to see (wake-word fired, command heard) is
# print()-ed explicitly from the callbacks below.
# basicConfig is a no-op if the root logger already has handlers;
# audio_api may have configured logging before us.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    IDLE = "IDLE"
    WAKE_HEARD = "WAKE_HEARD"
    PROCESSING = "PROCESSING"
    SPEAKING = "SPEAKING"

# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus."""

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always
                "en" for now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # STT (Vosk) — lazy-loaded on the first _voice_loop() iteration.
        # One Model instance; recognizers are created fresh per utterance.
        self._vosk_model = None
        self._KaldiRecognizer = None
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]
        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000
        # State
        self._state = State.IDLE
        self._command_audio = None  # set by _do_wake_heard, consumed by _do_processing
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_stt(self):
        """
        Load the Vosk ASR model. Replaces openai-whisper, which produced
        garbage ("!!!!!!!") on this Jetson's torch-aarch64 install
        regardless of audio quality. Vosk uses Kaldi's own CPU kernels:
        no torch, no numerical instability, ~10× faster than Whisper base
        on CPU.

        Model path is configured via stt.vosk_model_path (relative to
        PROJECT_ROOT, or absolute). Default: the small English model,
        which is ~40 MB and plenty for short voice commands.
        """
        from vosk import Model, KaldiRecognizer, SetLogLevel
        SetLogLevel(-1)  # silence Vosk's stderr spam
        if self._vosk_model is None:
            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
            if not os.path.isdir(model_path):
                raise RuntimeError(
                    "[Voice] Vosk model not found at " + model_path + "\n"
                    "  Download it on the Jetson:\n"
                    "    cd ~/Marcus/Models\n"
                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
                    "    unzip vosk-model-small-en-us-0.15.zip"
                )
            log.info("Loading Vosk model: %s", model_path)
            self._vosk_model = Model(model_path)
            self._KaldiRecognizer = KaldiRecognizer
            log.info("Vosk model ready")
        # NO restricted grammar. The lexicon of Vosk's small English model
        # doesn't contain "sanad" (it's not an English word), so passing it
        # in a restricted grammar makes Vosk drop the word with:
        #     WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
        #     vocabulary: 'sanad'
        # and the decoder then only has "[unk]" → never matches anything
        # → "Transcribed" is always empty.
        #
        # Instead: open-vocabulary transcription, fuzzy-matched against
        # the stt.wake_words_en list, which contains the English words
        # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
        # step, signed, etc.).
        self._wake_grammar = None
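        # For reference: if the model's lexicon DID contain the wake word, a
        # restricted grammar would be a JSON list string built like this
        # (sketch only; deliberately not active in this build, see above):
        #   import json
        #   self._wake_grammar = json.dumps(self._wake_en + ["[unk]"])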

    # Back-compat alias for any caller that still references the old name
    _load_whisper = _load_stt

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic."""
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        step = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(step, num_bytes - len(raw)))
            if not data:
                # Mic stopped or no packets arriving; bail out rather than
                # spin forever (matches _record_until_silence's behavior).
                break
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`."""
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        all_audio = []
        silence_count = 0
        chunk_count = 0
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
        """
        Transcribe audio using Vosk.
        When `grammar` is a JSON list string (e.g. `'["sanad","[unk]"]'`),
        Vosk is constrained to that vocabulary only — perfect for wake-word
        detection where we KNOW the exact word we want to hear. Pass
        grammar=None for open-vocabulary transcription (used for commands).
        """
        import json as _json
        # Audio stats — still useful for "mic is silent" diagnostics.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
        if audio.size == 0:
            return ""
        # Fresh recognizer per utterance. Pass grammar if provided.
        if grammar:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
        else:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
        rec.SetWords(False)
        # Single-shot: feed the whole utterance in one AcceptWaveform call,
        # then take FinalResult. Chunk-based feeding split short "sanad"
        # utterances across chunk boundaries and Vosk's decoder often
        # refused to commit, returning empty. Single-shot matches every
        # voice-assistant example in Vosk's docs.
        #
        # When FinalResult is empty, fall back to the partial — sometimes
        # Vosk heard something but never reached a segmentation boundary,
        # so the text exists only as a partial. NOTE: the partial must be
        # read BEFORE FinalResult(), which finalizes and resets the
        # recognizer, leaving PartialResult() empty afterwards.
        rec.AcceptWaveform(audio.tobytes())
        partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
        final = _json.loads(rec.FinalResult()).get("text", "").strip()
        if not final and partial:
            final = partial
            log.info("  (partial only, no final commit)")
        text = final
        if not text:
            log.info("Transcribed: (empty)")
            return ""
        log.info("Transcribed: %s", text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
        but is lenient about punctuation/whitespace around the word.
        """
        import re
        text_lower = text.lower().strip()
        # Word-boundary regex is rebuilt per call — cheap, since this only
        # runs once per idle-listen window.
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False
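
    # Illustrative behavior of the matcher above (the variant list here is an
    # example, not the deployed config): with wake words ["sanad", "send",
    # "sand"], "please send the report" matches on "send", while "sandstorm
    # warning" does not, since the word-boundary regex rejects substring hits.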

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in a background thread."""
        self._load_stt()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 4-second chunks. Longer windows give
        Vosk's decoder enough context to commit short utterances like a
        single 'sanad'."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(4.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking). Threshold lowered to 60 to
        # match the G1 on-board mic's typical noise floor (std ~30-80 when
        # idle, ~150+ when someone speaks). With 100 we were skipping
        # quiet "sanad" utterances entirely.
        if audio.size == 0 or audio.std() < 60:
            return
        # Wake-word pass: _wake_grammar is None in this build, so this is
        # open-vocabulary transcription; matching happens in
        # _check_wake_word (see the grammar note in _load_stt).
        text = self._transcribe(audio, grammar=self._wake_grammar)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            # One clean line to the terminal so the operator knows voice
            # actually heard them, even though all other voice logs are
            # file-only. \n leads because we may be painting over a
            # half-drawn `Command:` prompt.
            print("\n  [Sanad] wake heard — recording command…")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for the "Yes" TTS to finish before recording.
        while self._audio.is_speaking:
            time.sleep(0.1)
        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
        # has been accumulating audio continuously (including pre-wake
        # silence and the TTS "Yes" that just played back into the mic
        # path). Without the flush, _record_until_silence() reads the old
        # buffered silence instantly, counts 3 silent chunks, and exits
        # before the user has started speaking the command.
        self._mic_capture.flush()
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25 s at 16 kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send to brain."""
        text = self._transcribe(self._command_audio)
        self._command_audio = None
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        log.info("Command: %s", text)
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a background thread."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        return self._state

    @property
    def is_running(self) -> bool:
        return self._running

# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    # sys is already imported at module top; just make sure the project
    # root is importable when this file is run directly.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        print(f"\n{'='*50}")
        print(f"  COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)
    print("Starting voice module... say the wake word ('Sanad') to wake.")
    print("Press Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
    print("Done.")