#!/usr/bin/env python3
"""
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
======================================================================

State machine:
    IDLE       → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command)     → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING   → (TTS done)           → IDLE

Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
TTS: Handled by API/audio_api.py

Usage:
    from Features.Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()  # background thread
    voice.stop()
"""

import json
import logging
import os
import subprocess
import threading
import time

import numpy as np

# ─── PATH CONFIG ─────────────────────────────────────────
from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")


def load_config(name: str) -> dict:
    """Load Config/config_<name>.json from the project root and return it as a dict."""
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    with open(path, "r") as f:
        return json.load(f)


# ─── STATE ENUM ──────────────────────────────────────────
class State:
    """String constants for the voice state machine (see module docstring)."""

    IDLE = "IDLE"
    WAKE_HEARD = "WAKE_HEARD"
    PROCESSING = "PROCESSING"
    SPEAKING = "SPEAKING"


# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus."""

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command
                is transcribed
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded (heavy; only load on start())
        self._wake_model = None
        self._cmd_model = None

        # Wake words (English compared case-insensitively; Arabic as-is —
        # .lower() is a no-op on Arabic script)
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]

        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        # Audio captured by _do_wake_heard, consumed by _do_processing.
        # Initialized here so _do_processing can never hit AttributeError.
        self._command_audio = None

        log.info("VoiceModule initialized")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_whisper(self):
        """Lazy-load both Whisper models (wake-word + command)."""
        import whisper

        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")
        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────
    @staticmethod
    def _pcm_to_array(raw: bytes) -> np.ndarray:
        """Convert raw s16le PCM bytes to int16 samples.

        Truncates to a whole number of 16-bit samples first —
        np.frombuffer raises ValueError on an odd-length buffer, which
        can happen when parec is terminated mid-sample.
        """
        usable = len(raw) - (len(raw) % 2)
        return np.frombuffer(raw[:usable], dtype=np.int16)

    @staticmethod
    def _read_exact(stream, nbytes: int) -> bytes:
        """Read exactly nbytes from a pipe (fewer only at EOF).

        A single pipe read() may return fewer bytes than requested, which
        would skew the per-chunk timing/silence accounting.
        """
        parts = []
        remaining = nbytes
        while remaining > 0:
            data = stream.read(remaining)
            if not data:
                break
            parts.append(data)
            remaining -= len(data)
        return b"".join(parts)

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record a fixed-length audio chunk from the mic via parec."""
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])
        proc = subprocess.Popen(
            ["parec", "-d", source, "--format=s16le", f"--rate={rate}",
             "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        raw = proc.stdout.read()
        proc.wait()  # reap the child — avoids accumulating zombie parec processes
        return self._pcm_to_array(raw)

    def _record_until_silence(self) -> np.ndarray:
        """Record until silence is detected or max duration reached."""
        source = self._mic["source_index"]
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]

        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        proc = subprocess.Popen(
            ["parec", "-d", source, "--format=s16le", f"--rate={rate}",
             "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )

        all_audio = []
        silence_count = 0
        chunk_count = 0
        try:
            while chunk_count < max_chunks:
                # 2 bytes per s16le sample; read the full chunk even if the
                # pipe delivers it in pieces.
                data = self._read_exact(proc.stdout, chunk_samples * 2)
                if not data:
                    break
                chunk = self._pcm_to_array(data)
                if chunk.size == 0:
                    break
                all_audio.append(chunk)
                chunk_count += 1

                # RMS-based silence detection; float64 avoids int16 overflow
                # when squaring.
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0

                # Require a couple of chunks of actual speech before a
                # trailing silence can end the recording.
                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain so the child can exit
            proc.wait()  # reap — no zombie left behind

        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe audio using Whisper. Returns text.

        Args:
            audio: int16 PCM samples (Whisper expects 16kHz input).
            model: a loaded whisper model (wake or command).
            task: whisper task, "transcribe" or "translate".
        """
        # Convert int16 to float32 [-1, 1] as Whisper expects
        audio_f32 = audio.astype(np.float32) / 32768.0
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Check if transcribed text contains a wake word."""
        text_lower = text.lower().strip()
        # English wake words
        for w in self._wake_en:
            if w in text_lower:
                return True
        # Arabic wake words
        for w in self._wake_ar:
            if w in text:
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        log.info("Voice loop started — listening for wake word...")

        # Unmute mic once
        subprocess.run(
            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
            capture_output=True,
        )

        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Top-level boundary: log and recover to IDLE so one bad
                # cycle doesn't kill the listener thread.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

        audio = self._record_chunk(2.0)

        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return

        # Skip empty captures (std() of an empty array is nan, which would
        # silently pass the threshold test) and too-quiet ones.
        if audio.size == 0 or audio.std() < 100:
            return

        text = self._transcribe(audio, self._wake_model)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"], "en")

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for "Listening..." TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)

        log.info("Recording command...")
        audio = self._record_until_silence()

        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return

        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send to brain."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return

        # Detect language via Arabic Unicode block presence
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)

        # Send to brain callback
        if self._on_command:
            try:
                self._on_command(text, lang)
            except Exception as e:
                log.error("Brain callback error: %s", e)

        # PROCESSING → SPEAKING per the module state machine: the SPEAKING
        # branch waits for any TTS the brain triggered before re-arming the
        # wake-word listener (falls straight through to IDLE if no TTS).
        self._state = State.SPEAKING

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in background thread."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        return self._state

    @property
    def is_running(self) -> bool:
        return self._running


# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys

    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)

    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")
    voice.start()

    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
        print("Done.")