#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================

State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Vosk transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Sanad" (open-vocabulary Vosk transcription, fuzzy-matched
    against the mistranscription variants in config_Voice.json::stt.wake_words_en)
Commands:  transcribed by Vosk (small English model; see _load_stt)
Mic:       G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS:       English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Voice.marcus_voice import VoiceModule

    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()   # background thread
    voice.stop()
"""
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Optional
import numpy as np
# ─── PATH + CONFIG ───────────────────────────────────────
# Single source of truth lives in Core/; everyone else imports from there.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
# logs/voice.log so they don't drown out the interactive `Command:` prompt.
# Anything the user needs to see (wake-word fired, command heard) is
# print()-ed explicitly from the callbacks below.
# basicConfig is a no-op if the root logger already has handlers;
# audio_api may have configured logging before us.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    IDLE = "IDLE"
    WAKE_HEARD = "WAKE_HEARD"
    PROCESSING = "PROCESSING"
    SPEAKING = "SPEAKING"

# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus."""

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — "lang" is always
                "en" for now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # STT (Vosk) — lazy-loaded on the first _voice_loop() iteration.
        # One Model instance; recognizers are created fresh per utterance.
        self._vosk_model = None
        self._KaldiRecognizer = None
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]
        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000
        # State
        self._state = State.IDLE
        self._command_audio = None  # set by _do_wake_heard, consumed by _do_processing
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_stt(self):
        """
        Load the Vosk ASR model. Replaces openai-whisper, which produced
        garbage ("!!!!!!!") on this Jetson's torch-aarch64 install
        regardless of audio quality. Vosk uses Kaldi's own CPU kernels:
        no torch, no numerical instability, ~10× faster than Whisper base
        on CPU.

        Model path is configured via stt.vosk_model_path (relative to
        PROJECT_ROOT, or absolute). Default: the small English model,
        which is ~40 MB and plenty for short voice commands.
        """
        from vosk import Model, KaldiRecognizer, SetLogLevel
        SetLogLevel(-1)  # silence Vosk's stderr spam
        if self._vosk_model is None:
            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
            if not os.path.isdir(model_path):
                raise RuntimeError(
                    "[Voice] Vosk model not found at " + model_path + "\n"
                    "  Download it on the Jetson:\n"
                    "    cd ~/Marcus/Models\n"
                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
                    "    unzip vosk-model-small-en-us-0.15.zip"
                )
            log.info("Loading Vosk model: %s", model_path)
            self._vosk_model = Model(model_path)
            self._KaldiRecognizer = KaldiRecognizer
            log.info("Vosk model ready")
        # NO restricted grammar. The lexicon of Vosk's small English model
        # doesn't contain "sanad" (it's not an English word), so passing it
        # in a restricted grammar makes Vosk drop the word with:
        #     WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
        #     vocabulary: 'sanad'
        # and the decoder then only has "[unk]" → never matches anything
        # → "Transcribed" is always empty.
        #
        # Instead: open-vocabulary transcription, fuzzy-matched against
        # the stt.wake_words_en list, which contains the English words
        # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
        # step, signed, etc.).
        self._wake_grammar = None
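        # For reference: if the model's lexicon DID contain the wake word, a
        # restricted grammar would be a JSON list string built like this
        # (sketch only; deliberately not active in this build, see above):
        #   import json
        #   self._wake_grammar = json.dumps(self._wake_en + ["[unk]"])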

    # Back-compat alias for any caller that still references the old name
    _load_whisper = _load_stt

    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Capture a fixed-duration chunk from the G1 built-in mic."""
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        step = 1024
        while len(raw) < num_bytes:
            data = self._mic_capture.read_chunk(min(step, num_bytes - len(raw)))
            if not data:
                # Mic stopped or no packets arriving; bail out rather than
                # spin forever (matches _record_until_silence's behavior).
                break
            raw.extend(data)
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Capture until RMS drops below threshold for `silence_duration_sec`."""
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        all_audio = []
        silence_count = 0
        chunk_count = 0
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
        """
        Transcribe audio using Vosk.
        When `grammar` is a JSON list string (e.g. `'["sanad","[unk]"]'`),
        Vosk is constrained to that vocabulary only — perfect for wake-word
        detection where we KNOW the exact word we want to hear. Pass
        grammar=None for open-vocabulary transcription (used for commands).
        """
        import json as _json
        # Audio stats — still useful for "mic is silent" diagnostics.
        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
        if audio.size == 0:
            return ""
        # Fresh recognizer per utterance. Pass grammar if provided.
        if grammar:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
        else:
            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
        rec.SetWords(False)
        # Single-shot: feed the whole utterance in one AcceptWaveform call,
        # then take FinalResult. Chunk-based feeding split short "sanad"
        # utterances across chunk boundaries and Vosk's decoder often
        # refused to commit, returning empty. Single-shot matches every
        # voice-assistant example in Vosk's docs.
        #
        # When FinalResult is empty, fall back to the partial — sometimes
        # Vosk heard something but never reached a segmentation boundary,
        # so the text exists only as a partial. NOTE: the partial must be
        # read BEFORE FinalResult(), which finalizes and resets the
        # recognizer, leaving PartialResult() empty afterwards.
        rec.AcceptWaveform(audio.tobytes())
        partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
        final = _json.loads(rec.FinalResult()).get("text", "").strip()
        if not final and partial:
            final = partial
            log.info("  (partial only, no final commit)")
        text = final
        if not text:
            log.info("Transcribed: (empty)")
            return ""
        log.info("Transcribed: %s", text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """
        Check if transcribed text contains an English wake word.
        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
        but is lenient about punctuation/whitespace around the word.
        """
        import re
        text_lower = text.lower().strip()
        # Word-boundary regex is rebuilt per call — cheap, since this only
        # runs once per idle-listen window.
        for w in self._wake_en:
            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
                return True
        return False
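
    # Illustrative behavior of the matcher above (the variant list here is an
    # example, not the deployed config): with wake words ["sanad", "send",
    # "sand"], "please send the report" matches on "send", while "sandstorm
    # warning" does not, since the word-boundary regex rejects substring hits.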

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in a background thread."""
        self._load_stt()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 4-second chunks. Longer windows give
        Vosk's decoder enough context to commit short utterances like a
        single 'sanad'."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(4.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking). Threshold lowered to 60 to
        # match the G1 on-board mic's typical noise floor (std ~30-80 when
        # idle, ~150+ when someone speaks). With 100 we were skipping
        # quiet "sanad" utterances entirely.
        if audio.size == 0 or audio.std() < 60:
            return
        # Wake-word pass: _wake_grammar is None in this build, so this is
        # open-vocabulary transcription; matching happens in
        # _check_wake_word (see the grammar note in _load_stt).
        text = self._transcribe(audio, grammar=self._wake_grammar)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            # One clean line to the terminal so the operator knows voice
            # actually heard them, even though all other voice logs are
            # file-only. \n leads because we may be painting over a
            # half-drawn `Command:` prompt.
            print("\n  [Sanad] wake heard — recording command…")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
        # Wait for the "Yes" TTS to finish before recording.
        while self._audio.is_speaking:
            time.sleep(0.1)
        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
        # has been accumulating audio continuously (including pre-wake
        # silence and the TTS "Yes" that just played back into the mic
        # path). Without the flush, _record_until_silence() reads the old
        # buffered silence instantly, counts 3 silent chunks, and exits
        # before the user has started speaking the command.
        self._mic_capture.flush()
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25 s at 16 kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the command and send to brain."""
        text = self._transcribe(self._command_audio)
        self._command_audio = None
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return
        log.info("Command: %s", text)
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a background thread."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        return self._state

    @property
    def is_running(self) -> bool:
        return self._running

# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    # sys is already imported at module top; just make sure the project
    # root is importable when this file is run directly.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_command(text, lang):
        print(f"\n{'='*50}")
        print(f"  COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=on_command)
    print("Starting voice module... say the wake word ('Sanad') to wake.")
    print("Press Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
    print("Done.")