# Marcus/Voice/marcus_voice.py  (export header: 210 lines, 7.6 KiB, Python)
#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Marcus Wake-Signal Module (no ML, no STT).
This is a deliberately-minimal voice subsystem:
- A custom energy-based wake detector (Voice/wake_detector.py) listens
to the G1's on-board mic continuously.
- When the user says any short word (~0.2-1.5 s of speech followed by
silence), wake fires.
- The robot acknowledges via TTS ("Yes" — configurable).
- The user then types their command at the Marcus terminal prompt.
No Vosk, no Whisper, no torch, no network. Pure numpy DSP.
Why not STT here:
Both Vosk's small English model ("sanad" absent from lexicon) and
openai-whisper ("!!!!!" numerical garbage on this Jetson's torch-aarch64)
proved unreliable for this hardware. Rather than fight either, the
wake path becomes a simple "did the user say something?" signal.
Interface with Marcus brain:
VoiceModule(audio_api, on_wake=callback)
on_wake() is called when wake fires. Brain can display a prompt
or do anything else.
"""
from __future__ import annotations
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional
# ─── PATH + CONFIG ───────────────────────────────────────
# Make the project root importable *before* pulling in Core helpers.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

# All voice-subsystem activity goes to logs/voice.log with size-based rotation.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

_file_handler = RotatingFileHandler(
    os.path.join(LOG_DIR, "voice.log"),
    maxBytes=5_000_000,
    backupCount=3,
    encoding="utf-8",
)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[_file_handler],
)
log = logging.getLogger("marcus_voice")
class VoiceModule:
    """Wake-only voice subsystem — fires a callback when speech is detected.

    There is no STT here: the module only answers "did the user say
    something?", acknowledges via TTS, and hands control back to the brain.
    """

    def __init__(self, audio_api, on_command: Optional[Callable] = None,
                 on_wake: Optional[Callable] = None):
        """
        Args:
            audio_api: AudioAPI instance (for TTS ack).
            on_command: kept for API compatibility; always called with
                        text="" because there's no STT. Brain should
                        prompt the user to type.
            on_wake: alternative callback fired when wake detected.
                     Exactly one of on_command / on_wake is used.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake
        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Wake-detector parameters (tweakable via config_Voice.json::stt).
        from Voice.wake_detector import WakeDetector, WakeConfig
        wake_cfg = WakeConfig(
            sample_rate=16_000,
            speech_threshold=float(self._stt.get("speech_threshold", 150.0)),
            min_word_duration_s=float(self._stt.get("min_word_duration", 0.20)),
            max_word_duration_s=float(self._stt.get("max_word_duration", 1.50)),
            post_silence_s=float(self._stt.get("post_silence", 0.30)),
            cooldown_s=float(self._stt.get("wake_cooldown", 1.50)),
            chunk_ms=int(self._stt.get("wake_chunk_ms", 50)),
        )
        self._detector = WakeDetector(wake_cfg)

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        mic_cfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=mic_cfg.get("group", "239.168.123.161"),
            port=mic_cfg.get("port", 5555),
            buf_max=mic_cfg.get("buffer_max_bytes", 64000),
        )

        self._running = False
        self._thread = None
        log.info(
            "VoiceModule initialized (custom wake detector, "
            "speech_threshold=%s, min/max_word=%s/%s s)",
            wake_cfg.speech_threshold,
            wake_cfg.min_word_duration_s,
            wake_cfg.max_word_duration_s,
        )

    # ─── main loop ────────────────────────────────────────
    def _voice_loop(self):
        """Background loop: pull mic audio and feed the wake detector."""
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake (energy-based, no ML)")
        while self._running:
            try:
                if self._audio.is_speaking:
                    # The robot's own TTS is playing: pause briefly and clear
                    # detector state so our speech can't self-trigger the wake.
                    time.sleep(0.1)
                    self._detector.reset()
                    continue
                frame = self._mic_capture.read_chunk(1024)  # ~32 ms at 16 kHz
                if frame and self._detector.process(frame):
                    self._on_wake_fired()
            except Exception as exc:
                # Top-of-thread boundary: log and keep the loop alive.
                log.error("Voice loop error: %s", exc, exc_info=True)
                time.sleep(1)

    def _on_wake_fired(self):
        """Acknowledge a wake acoustically and notify the brain callback."""
        log.info("Wake detected (acoustic)")
        print("\n [Sanad] wake heard — type your command at the prompt.")
        # TTS ack — best-effort; a failed ack must not swallow the wake.
        ack_text = self._messages.get("wake_heard", "Yes")
        try:
            self._audio.speak(ack_text)
        except Exception as exc:
            log.warning("TTS ack failed: %s", exc)
        # Brain callbacks for compatibility with the old interface.
        if self._on_wake:
            try:
                self._on_wake()
            except Exception as exc:
                log.error("on_wake callback error: %s", exc)
            return
        if self._on_command:
            # Old API expected (text, lang). We have no transcription, so
            # pass empty text — brain is expected to prompt for typed input.
            try:
                self._on_command("", "en")
            except Exception as exc:
                log.error("on_command callback error: %s", exc)

    # ─── start / stop ─────────────────────────────────────
    def start(self):
        """Launch the background listening thread (no-op if already running)."""
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        worker = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread = worker
        worker.start()
        log.info("Voice module started")

    def stop(self):
        """Stop the mic capture and join the listening thread."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass  # best-effort: mic may never have been started
        worker, self._thread = self._thread, None
        if worker:
            worker.join(timeout=5)
        log.info("Voice module stopped")

    @property
    def is_running(self) -> bool:
        """True while the voice thread is (or should be) alive."""
        return self._running
# ─── standalone test ─────────────────────────────────────
if __name__ == "__main__":
    from API.audio_api import AudioAPI

    def on_wake():
        print(" (brain callback fired)")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_wake=on_wake)
    print("Starting voice module... say any short word to test the wake.")
    print("Press Ctrl-C to stop.\n")
    voice.start()
    try:
        # Spin until interrupted; the wake loop runs in its own thread.
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()
    print("Done.")