#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================

State machine:
    IDLE       → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command)     → PROCESSING
    PROCESSING → (Vosk transcribe)    → send to brain → SPEAKING
    SPEAKING   → (TTS done)           → IDLE

Wake word: "Sanad" (detected by Vosk; mistranscription variants in
           config_Voice.json::stt.wake_words_en)
Commands:  transcribed by Vosk (open vocabulary, small English model)
Mic:       G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS:       English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()   # background thread
    voice.stop()
"""

import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Optional

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
# logs/voice.log so they don't drown out the interactive `Command:` prompt.
# Anything the user needs to see (wake-word fired, command heard) is
# print()-ed explicitly from the callbacks below.
# basicConfig is idempotent; audio_api may have already called it.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000,
            backupCount=3,
            encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")


# ─── STATE ENUM ──────────────────────────────────────────
class State:
    IDLE = "IDLE"
    WAKE_HEARD = "WAKE_HEARD"
    PROCESSING = "PROCESSING"
    SPEAKING = "SPEAKING"


# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus."""

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api:  AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always
                        "en" now; kept in the signature for interface
                        stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # STT (Vosk) — lazy loaded on first _voice_loop() iteration.
        # One Model instance; recognizers are created fresh per utterance.
        self._vosk_model = None
        self._KaldiRecognizer = None

        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en", ["marcus", "marcos"])]

        # G1 built-in mic (UDP multicast).
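        # Audio arrives as 16 kHz mono int16 PCM; BuiltinMic buffers it
        # internally (ring buffer) and exposes read_chunk()/flush() — see
        # Voice/builtin_mic.py for the receiver details.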
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000

        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()

        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_stt(self):
        """
        Load the Vosk ASR model.

        Replaces openai-whisper, which produced garbage ("!!!!!!!") on this
        Jetson's torch-aarch64 install regardless of audio quality. Vosk
        uses Kaldi's own CPU kernels — no torch, no numerical instability,
        ~10× faster than Whisper base on CPU.

        Model path is configured via stt.vosk_model_path (relative to
        PROJECT_ROOT, or absolute). Default: the small English model,
        which is ~40 MB and plenty for short voice commands.
        """
        from vosk import Model, KaldiRecognizer, SetLogLevel
        SetLogLevel(-1)  # silence Vosk's stderr spam

        if self._vosk_model is None:
            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
            if not os.path.isdir(model_path):
                raise RuntimeError(
                    "[Voice] Vosk model not found at " + model_path + "\n"
                    "  Download it on the Jetson:\n"
                    "    cd ~/Marcus/Models\n"
                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
                    "    unzip vosk-model-small-en-us-0.15.zip"
                )
            log.info("Loading Vosk model: %s", model_path)
            self._vosk_model = Model(model_path)
            self._KaldiRecognizer = KaldiRecognizer
            log.info("Vosk model ready")

        # NO restricted grammar. Vosk's small English model's lexicon
        # doesn't contain "sanad" (it's not an English word), so passing
        # it in a restricted grammar makes Vosk drop the word with:
        #     WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
        #     vocabulary: 'sanad'
        # and the decoder then only has "[unk]" → never matches anything
        # → "Transcribed" is always empty.
        #
        # Instead: open-vocabulary transcription, fuzzy-matched against
        # the stt.wake_words_en list, which contains the English words
        # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
        # step, signed, etc.).
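        #
        # For reference, a sketch of the grammar-constrained recognizer
        # this rules out (Vosk's documented three-argument form; shown
        # only to make the failure mode concrete — do not enable):
        #
        #     import json
        #     grammar = json.dumps(["sanad", "[unk]"])
        #     rec = KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
        #     # → VoskAPI warns "Ignoring word missing in vocabulary: 'sanad'"
        #     #   and the decoder can only ever emit "[unk]".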
        self._wake_grammar = None

    # Back-compat alias for any caller that still references the old name
    _load_whisper = _load_stt

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic."""
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        chunk_size = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(chunk_size, num_bytes - len(raw)))
            if not data:
                break  # receiver stopped — return whatever we have
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`."""
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        all_audio = []
        silence_count = 0
        chunk_count = 0
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break

        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
        """
        Transcribe audio using Vosk.

        When `grammar` is a JSON list string (e.g. '["sanad","[unk]"]'),
        Vosk is constrained to that vocabulary only — perfect for
        wake-word detection where we KNOW the exact word we want to hear.
        Pass grammar=None for open-vocabulary transcription (used for
        commands).
        """
        import json as _json

        # Audio stats — still useful for "mic is silent" diagnostics.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
        if audio.size == 0:
            return ""

        # Fresh recognizer per utterance. Pass grammar if provided.
        if grammar:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
        else:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
        rec.SetWords(False)

        # Single-shot: feed the whole utterance in one AcceptWaveform call,
        # then take FinalResult. Chunk-based feeding split short "sanad"
        # utterances across chunk boundaries and Vosk's decoder often
        # refused to commit, returning empty. Single-shot works for every
        # voice-assistant example in Vosk's docs.
        #
        # When FinalResult is empty, also check PartialResult — sometimes
        # Vosk heard something but didn't reach a segmentation boundary
        # yet. PartialResult still has the text, just not "finalized".
        rec.AcceptWaveform(audio.tobytes())
        final = _json.loads(rec.FinalResult()).get("text", "").strip()
        if not final:
            partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
            if partial:
                final = partial
                log.info(" (partial only, no final commit)")

        text = final
        if not text:
            log.info("Transcribed: (empty)")
            return ""
        log.info("Transcribed: %s", text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off
        "sand"), but is lenient about punctuation/whitespace around the
        word.
        """
        import re
        text_lower = text.lower().strip()
        # Word-boundary regex built once per call (cheap — this runs once
        # per 4-second idle chunk).
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_stt()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")

        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 4-second chunks. Longer windows give
        Vosk's decoder enough context to commit short utterances like a
        single 'sanad'."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

        audio = self._record_chunk(4.0)

        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return

        # Skip if too quiet (no one talking). Threshold lowered to 60 to
        # match the G1 on-board mic's typical noise floor (std ~30-80 when
        # idle, ~150+ when someone speaks). With 100 we were skipping
        # quiet "sanad" utterances entirely.
        if audio.std() < 60:
            return

        # Wake-word pass uses open-vocabulary transcription; the restricted
        # grammar is disabled (self._wake_grammar is None — see _load_stt
        # for why).
        text = self._transcribe(audio, grammar=self._wake_grammar)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            # One clean line to the terminal so the operator knows voice
            # actually heard them, even though all other voice logs are
            # file-only. \n leads because we may be painting over a
            # half-drawn `Command:` prompt.
            print("\n [Sanad] wake heard — recording command…")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for the "Yes" TTS to finish before recording.
        while self._audio.is_speaking:
            time.sleep(0.1)

        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
        # has been accumulating audio continuously (including pre-wake
        # silence and the TTS "Yes" that just played back into the mic
        # path). Without the flush, _record_until_silence() reads the old
        # buffered silence instantly, counts 3 silent chunks, and exits
        # before the user has started speaking the command.
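        # (The "3 silent chunks" figure is silence_duration_sec / chunk_sec
        # = 1.5 / 0.5 under the default config. flush() is assumed to drop
        # everything queued so the next read_chunk() starts from "now".)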
        self._mic_capture.flush()

        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25 s at 16 kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send it to the brain."""
        text = self._transcribe(self._command_audio)
        self._command_audio = None

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        log.info("Command: %s", text)

        # Send to brain callback (lang is always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)

        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a background thread."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        return self._state

    @property
    def is_running(self) -> bool:
        return self._running


# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)

    print("Starting voice module... say 'Sanad' to wake.")
    print("Press Ctrl+C to stop.\n")
    voice.start()

    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")

    voice.stop()
    print("Done.")
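
# ─── CONFIG REFERENCE (illustrative) ─────────────────────
# The config_Voice.json keys this module reads, with illustrative values —
# the deployed config is the source of truth, and the wake_words_en entries
# below are examples of mistranscriptions Vosk produces for "sanad", not a
# definitive list:
#
# {
#   "stt": {
#     "wake_words_en": ["sanad", "then", "send", "sand", "step", "signed"],
#     "vosk_model_path": "Models/vosk-model-small-en-us-0.15",
#     "silence_threshold": 500,
#     "silence_duration_sec": 1.5,
#     "max_record_sec": 15
#   },
#   "mic": { ... },
#   "mic_udp": {
#     "group": "239.168.123.161",
#     "port": 5555,
#     "buffer_max_bytes": 64000
#   },
#   "messages": {
#     "wake_heard": "Yes?",
#     "no_speech": "I didn't catch that."
#   }
# }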