# Marcus/Voice/marcus_voice.py — exported 2026-04-12 18:50:22 +04:00 (386 lines, 12 KiB)
#!/usr/bin/env python3
"""
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
======================================================================
State machine:
IDLE → (wake word detected) → WAKE_HEARD
WAKE_HEARD → (record command) → PROCESSING
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
SPEAKING → (TTS done) → IDLE
Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
TTS: Handled by API/audio_api.py
Usage:
from Features.Voice.marcus_voice import VoiceModule
voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
voice.start() # background thread
voice.stop()
"""
import json
import logging
import os
import subprocess
import threading
import time

import numpy as np
from dotenv import load_dotenv

# ─── PATH CONFIG ─────────────────────────────────────────
# .env may override PROJECT_BASE; the default matches the Unitree home dir.
load_dotenv()
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

# ─── LOGGING ─────────────────────────────────────────────
# Log both to logs/voice.log and to the console.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")
def load_config(name: str) -> dict:
    """Load and parse Config/config_<name>.json from the project root.

    Args:
        name: config suffix, e.g. "Voice" → Config/config_Voice.json

    Returns:
        The parsed JSON object as a dict.

    Raises:
        FileNotFoundError / json.JSONDecodeError on missing or invalid file.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # encoding pinned to UTF-8: the config holds Arabic wake words, which
    # would break under a non-UTF-8 locale default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# ─── STATE ENUM ──────────────────────────────────────────
class State:
    """String constants naming the voice pipeline's states.

    Transition order (driven by VoiceModule._voice_loop):
    IDLE → WAKE_HEARD → PROCESSING → SPEAKING → IDLE
    """
    IDLE = "IDLE"                # listening for the wake word in short chunks
    WAKE_HEARD = "WAKE_HEARD"    # wake word detected; record the full command
    PROCESSING = "PROCESSING"    # transcribe the command and dispatch to the brain
    SPEAKING = "SPEAKING"        # waiting for TTS playback to finish
# ─── VOICE MODULE ────────────────────────────────────────
class VoiceModule:
    """Always-listening voice interface for Marcus.

    A background thread drives the state machine described in ``State``:
    2-second mic chunks are transcribed by a small Whisper model until a
    wake word appears; then a full command is recorded until silence,
    transcribed by a larger model, and handed to the brain callback.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command is transcribed
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]
        # Whisper models — lazy loaded on the first loop iteration
        self._wake_model = None
        self._cmd_model = None
        # Wake words (English matched case-insensitively, Arabic verbatim)
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]
        # State
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()
        # WAKE_HEARD → PROCESSING handoff buffer. Initialized here so
        # _do_processing can never hit an AttributeError (previously this
        # attribute was only created inside _do_wake_heard).
        self._command_audio = None
        log.info("VoiceModule initialized")

    # ─── MODEL LOADING ────────────────────────────────────
    def _load_whisper(self):
        """Lazy-load the Whisper wake-word and command models (idempotent)."""
        import whisper
        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")
        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────
    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record roughly `seconds` of mono s16le audio from the mic via parec.

        Returns:
            int16 sample array (whatever the pipe buffered before terminate).
        """
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])
        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        raw = proc.stdout.read()
        proc.wait()  # reap the child — otherwise each poll leaves a zombie process
        return np.frombuffer(raw, dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Record until `silence_duration_sec` of quiet or `max_record_sec` total.

        Silence is detected via RMS energy per 0.5-second chunk, compared
        against the configured `silence_threshold`.

        Returns:
            int16 sample array; empty array if nothing was captured.
        """
        source = self._mic["source_index"]
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]
        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        all_audio = []
        silence_count = 0
        chunk_count = 0
        try:
            while chunk_count < max_chunks:
                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
                if not data:
                    break
                chunk = np.frombuffer(data, dtype=np.int16)
                all_audio.append(chunk)
                chunk_count += 1
                # Check for silence (RMS energy of this chunk)
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0
                # chunk_count > 2 keeps us from triggering on a leading pause
                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain
            proc.wait()  # reap the child — avoids accumulating zombie processes
        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────
    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio with the given Whisper model.

        Args:
            audio: int16 PCM samples (mic rate; Whisper expects 16kHz input).
            model: a loaded whisper model (wake or command).
            task: Whisper task, "transcribe" or "translate".

        Returns:
            The stripped transcription text (may be empty).
        """
        # Convert int16 to float32 in [-1, 1] as Whisper expects
        audio_f32 = audio.astype(np.float32) / 32768.0
        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,  # CPU-safe: fp16 only helps on CUDA
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Return True if the transcription contains any configured wake word."""
        text_lower = text.lower().strip()
        # English wake words — case-insensitive substring match
        for w in self._wake_en:
            if w in text_lower:
                return True
        # Arabic wake words — matched verbatim (lower() is a no-op for Arabic)
        for w in self._wake_ar:
            if w in text:
                return True
        return False

    # ─── MAIN LOOP ────────────────────────────────────────
    def _voice_loop(self):
        """Main voice processing loop — runs in the background thread.

        Loads models, unmutes the mic once, then dispatches on the current
        state until stop() clears the running flag. Any exception resets
        the machine to IDLE after a 1s back-off.
        """
        self._load_whisper()
        log.info("Voice loop started — listening for wake word...")
        # Unmute mic once and pin volume; best-effort (output ignored)
        subprocess.run(
            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
            capture_output=True,
        )
        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 2-second chunks."""
        # Skip if robot is speaking — prevents self-listening
        if self._audio.is_speaking:
            time.sleep(0.2)
            return
        audio = self._record_chunk(2.0)
        # Double-check speaking didn't start during recording
        if self._audio.is_speaking:
            return
        # Skip if too quiet (no one talking) — saves a Whisper call
        if audio.std() < 100:
            return
        text = self._transcribe(audio, self._wake_model)
        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD
            # Acknowledge
            self._audio.speak(
                self._config["messages"]["wake_heard"], "en"
            )

    def _do_wake_heard(self):
        """Record the command until silence, then hand off to PROCESSING."""
        # Wait for the acknowledgement TTS to finish before recording
        while self._audio.is_speaking:
            time.sleep(0.1)
        log.info("Recording command...")
        audio = self._record_until_silence()
        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return
        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the recorded command and send it to the brain callback."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer
        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return
        # Detect language from the Arabic Unicode block
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)
        # Send to brain callback; a callback crash must not kill the loop
        if self._on_command:
            try:
                self._on_command(text, lang)
            except Exception as e:
                log.error("Brain callback error: %s", e)
        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start voice listening in a daemon background thread (idempotent)."""
        if self._running:
            log.warning("Voice module already running")
            return
        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening; joins the worker thread for up to 5s."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        # Current State.* constant (string)
        return self._state

    @property
    def is_running(self) -> bool:
        # True between start() and stop()
        return self._running
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys

    # Make project-local imports (API/...) resolvable when run directly.
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _echo_command(text, lang):
        """Demo callback: pretty-print each transcribed command."""
        print(f"\n{'='*50}")
        print(f" COMMAND [{lang}]: {text}")
        print(f"{'='*50}\n")

    module = VoiceModule(AudioAPI(), on_command=_echo_command)
    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")
    module.start()
    try:
        while module.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        module.stop()
        print("Done.")