# Marcus/Voice/marcus_voice.py — exported 2026-04-12 18:50:22 +04:00 (386 lines, 12 KiB)
#!/usr/bin/env python3
"""
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
======================================================================
State machine:
IDLE → (wake word detected) → WAKE_HEARD
WAKE_HEARD → (record command) → PROCESSING
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
SPEAKING → (TTS done) → IDLE
Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
TTS: Handled by API/audio_api.py
Usage:
from Features.Voice.marcus_voice import VoiceModule
voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
voice.start() # background thread
voice.stop()
"""
import json
import logging
import os
import subprocess
import threading
import time

import numpy as np
from dotenv import load_dotenv

# ─── PATH CONFIG ─────────────────────────────────────────
# .env may override PROJECT_BASE; the default matches the Unitree home dir.
load_dotenv()
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

# ─── LOGGING ─────────────────────────────────────────────
# Log both to logs/voice.log and to the console.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")
def load_config(name: str) -> dict:
    """Load and parse Config/config_<name>.json from the project root.

    Args:
        name: config suffix, e.g. "Voice" → Config/config_Voice.json

    Returns:
        The parsed JSON object as a dict.

    Raises:
        FileNotFoundError / json.JSONDecodeError on missing or invalid file.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # encoding pinned to UTF-8: the config holds Arabic wake words, which
    # would break under a non-UTF-8 locale default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    """String constants naming the voice pipeline's states.

    Transition order (driven by VoiceModule._voice_loop):
    IDLE → WAKE_HEARD → PROCESSING → SPEAKING → IDLE
    """
    IDLE = "IDLE"                # listening for the wake word in short chunks
    WAKE_HEARD = "WAKE_HEARD"    # wake word detected; record the full command
    PROCESSING = "PROCESSING"    # transcribe the command and dispatch to the brain
    SPEAKING = "SPEAKING"        # waiting for TTS playback to finish
# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus.

    A background thread drives the state machine described in ``State``:
    2-second mic chunks are transcribed by a small Whisper model until a
    wake word appears; then a full command is recorded until silence,
    transcribed by a larger model, and handed to the brain callback.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command is transcribed
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # Whisper models — lazy loaded on the first loop iteration
        self._wake_model = None
        self._cmd_model = None
        # Wake words (English matched case-insensitively, Arabic verbatim)
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]
        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        # WAKE_HEARD → PROCESSING handoff buffer. Initialized here so
        # _do_processing can never hit an AttributeError (previously this
        # attribute was only created inside _do_wake_heard).
        self._command_audio = None
        log.info("VoiceModule initialized")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_whisper(self):
        """Lazy-load the Whisper wake-word and command models (idempotent)."""
        import whisper
        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")
        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record roughly `seconds` of mono s16le audio from the mic via parec.

        Returns:
            int16 sample array (whatever the pipe buffered before terminate).
        """
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])
        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        raw = proc.stdout.read()
        proc.wait()  # reap the child — otherwise each poll leaves a zombie process
        return np.frombuffer(raw, dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Record until `silence_duration_sec` of quiet or `max_record_sec` total.

        Silence is detected via RMS energy per 0.5-second chunk, compared
        against the configured `silence_threshold`.

        Returns:
            int16 sample array; empty array if nothing was captured.
        """
        source = self._mic["source_index"]
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]
        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        all_audio = []
        silence_count = 0
        chunk_count = 0
        try:
            while chunk_count < max_chunks:
                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
                if not data:
                    break
                chunk = np.frombuffer(data, dtype=np.int16)
                all_audio.append(chunk)
                chunk_count += 1
                # Check for silence (RMS energy of this chunk)
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0
                # chunk_count > 2 keeps us from triggering on a leading pause
                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain
            proc.wait()  # reap the child — avoids accumulating zombie processes
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio with the given Whisper model.

        Args:
            audio: int16 PCM samples (mic rate; Whisper expects 16kHz input).
            model: a loaded whisper model (wake or command).
            task: Whisper task, "transcribe" or "translate".

        Returns:
            The stripped transcription text (may be empty).
        """
        # Convert int16 to float32 in [-1, 1] as Whisper expects
        audio_f32 = audio.astype(np.float32) / 32768.0
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,  # CPU-safe: fp16 only helps on CUDA
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Return True if the transcription contains any configured wake word."""
        text_lower = text.lower().strip()
        # English wake words — case-insensitive substring match
        for w in self._wake_en:
            if w in text_lower:
                return True
        # Arabic wake words — matched verbatim (lower() is a no-op for Arabic)
        for w in self._wake_ar:
            if w in text:
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in the background thread.

        Loads models, unmutes the mic once, then dispatches on the current
        state until stop() clears the running flag. Any exception resets
        the machine to IDLE after a 1s back-off.
        """
        self._load_whisper()
        log.info("Voice loop started — listening for wake word...")
        # Unmute mic once and pin volume; best-effort (output ignored)
        subprocess.run(
            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
            capture_output=True,
        )
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(2.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking) — saves a Whisper call
        if audio.std() < 100:
            return
        text = self._transcribe(audio, self._wake_model)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(
                self._config["messages"]["wake_heard"], "en"
            )

    def _do_wake_heard(self):
        """Record the command until silence, then hand off to PROCESSING."""
        # Wait for the acknowledgement TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the recorded command and send it to the brain callback."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return
        # Detect language from the Arabic Unicode block
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)
        # Send to brain callback; a callback crash must not kill the loop
        if self._on_command:
            try:
                self._on_command(text, lang)
            except Exception as e:
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a daemon background thread (idempotent)."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening; joins the worker thread for up to 5s."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        # Current State.* constant (string)
        return self._state

    @property
    def is_running(self) -> bool:
        # True between start() and stop()
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys

    # Make project-local imports (API/...) resolvable when run directly.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _echo_command(text, lang):
        """Demo callback: pretty-print each transcribed command."""
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    module = VoiceModule(AudioAPI(), on_command=_echo_command)
    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")
    module.start()
    try:
        while module.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        module.stop()
        print("Done.")