Marcus/Voice/marcus_voice.py

#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus.

Pipeline:

    G1 mic UDP ──► BuiltinMic (Voice/audio_io.py)
                         │
                         ▼
               GeminiBrain (Voice/gemini/script.py)
                         │ audio out (24 kHz)
                         ▼
                BuiltinSpeaker (Voice/audio_io.py) ──► G1 speaker
                         │ user transcript (on_command)
                         ▼
                _dispatch_gemini_command
                    - require wake word "Sanad"
                    - fuzzy-match command_vocab
                    - dedup within command_cooldown_sec
                         │
                         ▼
                    on_command(text, "en")  ──► Marcus brain

Gemini owns both STT and TTS — it hears the user and replies with its own
voice. Marcus's on_command hook fires alongside Gemini's verbal reply so
motion commands (\"Sanad, turn right\") still move the robot body while
the conversation flows naturally.

Wake word is enforced at dispatch only — Gemini chats normally on all
speech; the robot moves only when \"Sanad\" + a recognised action phrase
appears in the transcript.
"""

from __future__ import annotations

import logging
import os
import re
import sys
import threading
import time
from difflib import SequenceMatcher
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional

import numpy as np

_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")


# ── Transcript log ─────────────────────────────────────────────
# Every user transcript Gemini emits is written here in a simple
# one-line-per-entry format. Rotates every 5 MB × 3 backups.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
_transcript_log.propagate = False
if not _transcript_log.handlers:
    _th = RotatingFileHandler(
        _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter("%(asctime)s  %(message)s"))
    _transcript_log.addHandler(_th)


def _log_transcript(action: str, text: str) -> None:
    _transcript_log.info("%-5s %s", action, (text or "").strip())


# ─── instruction.json — bilingual phrase tables ──────────────────
#
# Single source of truth for every voice phrase the dispatch layer cares
# about: wake-word variants (EN + AR), per-action user_phrases (what the
# user might say), per-action bot_phrases (what Gemini might say back).
# Loaded ONCE at module import; rebuilds the runtime tables below.
# Adding a new accent / variant / action is a JSON-only edit — no Python
# change required.
import json as _json


def _load_instructions() -> dict:
    path = os.path.join(PROJECT_ROOT, "Config", "instruction.json")
    try:
        with open(path, "r", encoding="utf-8") as f:
            return _json.load(f) or {}
    except Exception as e:
        # Fail soft — empty tables mean the dispatch gate just rejects
        # everything and is_running stays True. Better than a crash.
        try:
            log.error("instruction.json not loaded: %s", e)
        except Exception:
            pass
        return {}


_INSTRUCTIONS = _load_instructions()


def _build_wake_words(data: dict) -> set:
    out = set()
    wake = data.get("wake_words", {}) or {}
    for lang in ("english", "arabic"):
        for w in wake.get(lang, []) or []:
            if isinstance(w, str) and w.strip():
                out.add(w.strip().lower())
    return out


def _build_command_vocab(data: dict) -> list:
    """English-only canonical phrases — used by the difflib fuzzy matcher.
    Includes every action's `canonical` plus all entries from
    `user_phrases.english` (deduped, original order preserved per action)."""
    seen = set()
    out = []
    for action in (data.get("actions", {}) or {}).values():
        canon = (action.get("canonical") or "").strip()
        if canon and canon not in seen:
            seen.add(canon)
            out.append(canon)
        for p in (action.get("user_phrases", {}) or {}).get("english", []) or []:
            p = (p or "").strip()
            if p and p not in seen:
                seen.add(p)
                out.append(p)
    return out


def _build_arabic_motion_map(data: dict) -> dict:
    """Map Arabic user phrase → English canonical for every action."""
    out = {}
    for action in (data.get("actions", {}) or {}).values():
        canon = (action.get("canonical") or "").strip()
        if not canon:
            continue
        for p in (action.get("user_phrases", {}) or {}).get("arabic", []) or []:
            p = (p or "").strip()
            if p:
                out[p] = canon
    return out


def _build_bot_motion_patterns(data: dict) -> list:
    """List of (needle, canonical) the bot dispatcher matches Gemini's
    spoken reply against. English needles are lowercased so the dispatcher
    can use case-insensitive `in` checks; Arabic needles are kept verbatim."""
    out = []
    for action in (data.get("actions", {}) or {}).values():
        canon = (action.get("canonical") or "").strip()
        if not canon:
            continue
        bot = action.get("bot_phrases", {}) or {}
        for p in bot.get("english", []) or []:
            p = (p or "").strip()
            if p:
                out.append((p.lower(), canon))
        for p in bot.get("arabic", []) or []:
            p = (p or "").strip()
            if p:
                out.append((p, canon))
    # Sort by needle length descending so multi-word phrases match before
    # their shorter prefixes (e.g. "moving forward" before "moving").
    out.sort(key=lambda x: len(x[0]), reverse=True)
    return out


# Module-level vocabulary tables, all derived from instruction.json.
# Mutable (rebuildable) — VoiceModule.__init__ re-reads in case the file
# changed since import.
WAKE_WORDS: set = _build_wake_words(_INSTRUCTIONS)
COMMAND_VOCAB: list = _build_command_vocab(_INSTRUCTIONS)
_ARABIC_MOTION_TO_CANONICAL: dict = _build_arabic_motion_map(_INSTRUCTIONS)
_BOT_MOTION_PATTERNS: list = _build_bot_motion_patterns(_INSTRUCTIONS)

# Garbage patterns + min length stay in config_Voice.json (they're
# noise filtering, not voice instructions).
GARBAGE_PATTERNS: set = set()
_MIN_TRANSCRIPTION_LENGTH: int = 3


def _has_wake_word(text: str) -> bool:
    """True if `text` contains any wake-word variant as a whole word."""
    low = text.lower()
    for w in WAKE_WORDS:
        if re.search(r'\b' + re.escape(w) + r'\b', low):
            return True
    return False


def _strip_wake_word_once(text: str) -> str:
    """Single pass of wake-word stripping. Use via _strip_wake_word()."""
    stripped = text.strip()
    for w in WAKE_WORDS:
        if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
            return ""
    for w in sorted(WAKE_WORDS, key=len, reverse=True):
        m = re.match(
            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
        m = re.match(
            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
    return text


def _strip_wake_word(text: str) -> str:
    """
    Remove the wake word from the start or end of text, iteratively, so
    repeated-wake transcriptions ("Sanad. Sanad.") fully collapse.
    Capped at 5 passes to prevent pathological inputs from looping.
    """
    for _ in range(5):
        stripped = _strip_wake_word_once(text)
        if stripped == text:
            return text
        text = stripped
    return text


def _translate_arabic_motion(text: str) -> str:
    """Translate Arabic motion phrases to English canonical equivalents,
    using the table built from instruction.json::actions[*].user_phrases.arabic.
    Substring match; longest-needle wins so multi-word phrases match before
    their shorter prefixes."""
    s = text.strip()
    if not s or not _ARABIC_MOTION_TO_CANONICAL:
        return text
    for ar in sorted(_ARABIC_MOTION_TO_CANONICAL.keys(), key=len, reverse=True):
        if ar in s:
            return _ARABIC_MOTION_TO_CANONICAL[ar]
    return text


def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a transcription to the closest known command phrase.
    Returns the canonical command if there's a close-enough match, else
    returns the original text unchanged.
    """
    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    for cmd in COMMAND_VOCAB:
        if cmd in low:
            return cmd

    best_cmd = None
    best_ratio = 0.0
    for cmd in COMMAND_VOCAB:
        r = SequenceMatcher(None, low, cmd).ratio()
        if r > best_ratio:
            best_ratio = r
            best_cmd = cmd

    if best_ratio >= cutoff:
        return best_cmd
    return text


class VoiceModule:
    """Thin orchestrator around GeminiBrain + command dispatch."""

    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake

        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Reload instruction.json so a hot-edit between runs is picked
        # up without re-importing the module. All four phrase tables
        # (wake_words, command_vocab, Arabic→canonical map, bot motion
        # patterns) are rebuilt from instruction.json — single source of
        # truth. Garbage_patterns + min_transcription_length stay in
        # config_Voice.json (those are noise filtering, not voice
        # instruction tables).
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, \
            _MIN_TRANSCRIPTION_LENGTH, _ARABIC_MOTION_TO_CANONICAL, \
            _BOT_MOTION_PATTERNS, _INSTRUCTIONS
        _INSTRUCTIONS                = _load_instructions()
        WAKE_WORDS                   = _build_wake_words(_INSTRUCTIONS)
        COMMAND_VOCAB                = _build_command_vocab(_INSTRUCTIONS)
        _ARABIC_MOTION_TO_CANONICAL  = _build_arabic_motion_map(_INSTRUCTIONS)
        _BOT_MOTION_PATTERNS         = _build_bot_motion_patterns(_INSTRUCTIONS)
        GARBAGE_PATTERNS  = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info(
            "instruction.json loaded: %d wake_words, %d command_vocab, "
            "%d arabic→canonical, %d bot patterns; "
            "+ %d garbage_patterns from config_Voice.json",
            len(WAKE_WORDS), len(COMMAND_VOCAB),
            len(_ARABIC_MOTION_TO_CANONICAL), len(_BOT_MOTION_PATTERNS),
            len(GARBAGE_PATTERNS),
        )

        # Dispatch dedup state: Gemini's input_transcription can fire
        # multiple times per turn (streaming partials). Track the last
        # canonical command + timestamp so we don't move twice.
        self._last_gemini_canon = ""
        self._last_gemini_dispatch_at = 0.0

        # Wake-word gate state. Motion (whether triggered from the user's
        # transcript OR from Gemini's verbal confirmation) only fires
        # within a turn whose user transcript contained "Sanad" / "سند"
        # at least once. Reset on turn_end. This is the safety rule:
        # actions ALWAYS require the wake word, in either language.
        self._wake_active_for_turn = False

        # Gemini brain reference for flush_mic() — populated by
        # _voice_loop_gemini after spawning the runner subprocess.
        self._brain = None

        # Per-turn buffer for Gemini's spoken text. Gemini Live emits the
        # output transcription in many small chunks ("I", "see", "a", ...);
        # we accumulate them and print one clean `[Sanad] said: "..."`
        # line per turn. Flushed on turn_end OR when a chunk ends with
        # sentence-ending punctuation.
        self._gemini_say_buf = []
        self._gemini_say_lock = threading.Lock()
        self._gemini_say_last_chunk_at = 0.0

        self._running = False
        self._thread = None

        log.info("VoiceModule initialized (backend=gemini)")

    # ─── main loop ────────────────────────────────────────

    def _voice_loop(self):
        """
        Spawn the Gemini Live STT subprocess (runs in the gemini_sdk
        Python 3.10+ env) and forward its transcripts into Marcus's
        dispatch gate. Marcus's main process never opens the Gemini
        WebSocket itself — google-genai needs Python ≥3.9 and marcus
        is pinned to 3.8 by the Jetson torch wheel.
        """
        api_key = (
            os.environ.get("MARCUS_GEMINI_API_KEY")
            or os.environ.get("SANAD_GEMINI_API_KEY")
            or self._stt.get("gemini_api_key", "")
        )
        if not api_key:
            log.error(
                "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY "
                "or stt.gemini_api_key in Config/config_Voice.json"
            )
            while self._running:
                time.sleep(0.5)
            return

        from Voice.gemini_script import GeminiBrain

        # Env overrides for model + voice are passed through to the
        # runner subprocess automatically (it reads the same env vars).
        model = (
            os.environ.get("MARCUS_GEMINI_MODEL")
            or self._stt.get(
                "gemini_model",
                "gemini-2.5-flash-native-audio-preview-12-2025",
            )
        )
        voice_name = (
            os.environ.get("MARCUS_GEMINI_VOICE")
            or self._stt.get("gemini_voice_name", "Charon")
        )
        # System prompt: the runner reads the same config & file paths,
        # but we forward the resolved string in case marcus's config layer
        # picked a fallback. Forwarded via env in GeminiBrain.start().
        system_prompt = self._stt.get(
            "gemini_system_prompt",
            "Transcribe what the user says to Sanad. Stay silent.",
        )
        sp_file = self._stt.get("gemini_system_prompt_file", "")
        if sp_file:
            sp_path = sp_file if os.path.isabs(sp_file) else os.path.join(
                PROJECT_ROOT, sp_file,
            )
            try:
                with open(sp_path, "r", encoding="utf-8") as f:
                    loaded = f.read().strip()
                if loaded:
                    system_prompt = loaded
                    log.info(
                        "gemini system prompt loaded from %s (%d chars)",
                        sp_path, len(loaded),
                    )
            except Exception as e:
                log.warning(
                    "gemini_system_prompt_file=%r unreadable: %s — "
                    "using inline config", sp_file, e,
                )

        log.info(
            "Voice loop started — GEMINI S2S subprocess "
            "(model=%s, voice=%s)", model, voice_name,
        )

        brain = GeminiBrain(
            None, None,                  # audio_io, recorder owned by runner
            voice_name=voice_name,
            system_prompt=system_prompt,
            api_key=api_key,
            on_transcript=self._on_gemini_transcript,
            on_command=self._dispatch_gemini_command,
            on_bot_text=self._on_gemini_say_chunk,
            on_turn_end=self._on_gemini_turn_end,
        )
        self._brain = brain
        brain.start()

        # ── Camera-frame sender ────────────────────────────────────
        # Stream JPEG frames to the runner so Gemini Live can SEE what
        # the robot sees. Without this, "what do you see" / "describe
        # this exhibit" answers would be hallucinations. The runner
        # forwards them to Gemini as image/jpeg blobs and de-stales
        # anything older than gemini_frame_max_age_sec.
        send_frames = bool(self._stt.get("gemini_send_frames", True))
        frame_interval = float(self._stt.get("gemini_frame_interval_sec", 0.5))
        frame_thread = None
        frame_stop = threading.Event()
        if send_frames:
            try:
                from API.camera_api import get_frame as _camera_get_frame
            except Exception as e:
                log.warning("camera_api unavailable — frame streaming disabled: %s", e)
                _camera_get_frame = None
            if _camera_get_frame is not None:
                def _frame_sender_loop():
                    log.info(
                        "frame sender started — %.2fs interval, "
                        "streaming camera frames to Gemini Live",
                        frame_interval,
                    )
                    while not frame_stop.is_set() and self._running:
                        try:
                            frame_b64 = _camera_get_frame()
                            if frame_b64:
                                # camera_api returns a base64 ASCII string —
                                # GeminiBrain.send_frame accepts that directly.
                                brain.send_frame(frame_b64)
                        except Exception as e:
                            log.debug("frame send failed: %s", e)
                        frame_stop.wait(frame_interval)
                    log.info("frame sender stopped")

                frame_thread = threading.Thread(
                    target=_frame_sender_loop,
                    daemon=True, name="gemini-frames",
                )
                frame_thread.start()

        try:
            while self._running:
                time.sleep(0.25)
        finally:
            frame_stop.set()
            if frame_thread is not None:
                frame_thread.join(timeout=2)
            brain.stop()
            self._brain = None

    # ─── dispatch side channel ────────────────────────────

    def _on_gemini_transcript(self, text: str) -> None:
        """Log every user transcript to logs/transcript.log."""
        if text:
            _log_transcript("HEARD", text)

    def _on_gemini_say_chunk(self, text: str) -> None:
        """
        Receive a Gemini output-transcription chunk. Two side effects:
          1. Forward to the bot dispatcher so motion can fire on
             confirmation phrases (Turning right / Sitting down / etc.).
          2. Buffer the chunk for the per-turn `[Sanad] said: ...` line
             that prints once on turn_end (or sooner if the chunk ends
             with sentence punctuation).
        """
        # Motion side-channel — chunk-level so dispatch is fast.
        try:
            self._dispatch_gemini_bot(text)
        except Exception:
            pass

        with self._gemini_say_lock:
            self._gemini_say_buf.append(text)
            self._gemini_say_last_chunk_at = time.time()
            # Flush early if this chunk closes a sentence — typical for
            # short acks like "Turning right." that arrive as one chunk.
            if text.rstrip().endswith((".", "!", "?")):
                self._flush_gemini_say_locked()

    def _on_gemini_turn_end(self) -> None:
        """Flush any pending Gemini output chunks at turn boundary,
        and close the wake-word motion gate for the next turn."""
        with self._gemini_say_lock:
            self._flush_gemini_say_locked()
        self._wake_active_for_turn = False

    def _flush_gemini_say_locked(self) -> None:
        """Caller MUST hold self._gemini_say_lock. Prints one [Sanad] said: line."""
        if not self._gemini_say_buf:
            return
        joined = " ".join(t.strip() for t in self._gemini_say_buf if t).strip()
        while "  " in joined:
            joined = joined.replace("  ", " ")
        self._gemini_say_buf = []
        if joined:
            _log_transcript("SAID", joined)
            try:
                print(f'  [Sanad] said: "{joined[:200]}"')
                print("Command: ", end="", flush=True)
            except Exception:
                pass

    def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None:
        """
        Fire self._on_command for any transcript prefixed with the wake
        word "Sanad". Marcus's brain is the authoritative decision maker
        in the STT-only architecture — it handles motion AND Q&A AND
        vision queries AND replies via TtsMaker.

        The vocab-match gate has been dropped: if the user says
        "Sanad, what's the weather" the transcript still reaches the
        brain, which either answers via its VLM or declines. This keeps
        all Gemini-heard queries routed through one place (Marcus) and
        removes the audio collision that full S2S had.

        Examples:
            "Sanad, turn right"     → strip → "turn right" → brain → motion
            "Sanad, what do you see"→ strip → "what do you see" → brain → VLM
            "Sanad"                 → bare wake → skip (no payload)
            "turn right"            → no wake word → skip (conversation gate)

        Dedup: Gemini emits streaming partials; same normalized command
        within command_cooldown_sec fires only once.
        """
        if not text or not _has_wake_word(text):
            return

        # Wake word found — open the motion gate for this turn so the
        # bot-side dispatcher (Gemini's spoken confirmation) is also
        # allowed to fire. Reset on turn_end.
        self._wake_active_for_turn = True

        stripped = _strip_wake_word(text)
        if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH:
            return

        # Bilingual support: translate Arabic motion phrases to their
        # English canonical equivalent BEFORE fuzzy-matching. Marcus's
        # command_parser is English-only.
        stripped = _translate_arabic_motion(stripped)

        low = stripped.lower().strip().rstrip(".!?,")
        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
        if low in GARBAGE_PATTERNS and low not in vocab_exact:
            return

        # Fuzzy-normalize (maps "turn right up" → "turn right") if the
        # transcript is close to a vocab entry — but unlike before, we
        # forward everything that passed the wake-word gate, not just
        # vocab hits. Marcus's command_parser + VLM handles the rest.
        command = self._normalize_command(stripped)
        canon = command.lower().strip().rstrip(".!?,")

        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini): %s", command[:120])
        _log_transcript("CMD", command)
        if self._on_command:
            try:
                self._on_command(command, lang or "en")
            except Exception as e:
                log.error("on_command error: %s", e, exc_info=True)

    def flush_mic(self) -> None:
        """
        Tell the Gemini runner subprocess to drop its mic buffer. Called
        before AND after `audio_api.speak()` so the robot's own voice
        (picked up by the mic during TtsMaker playback) doesn't come back
        from Gemini as a fake user utterance.
        No-op if the runner hasn't started yet.
        """
        b = getattr(self, "_brain", None)
        if b is None:
            return
        try:
            b.flush_mic()
        except Exception:
            pass

    def _normalize_command(self, text: str) -> str:
        """Fuzzy-match a transcription to the closest canonical phrase."""
        canonical = _closest_command(text, cutoff=self._vocab_cutoff)
        if canonical != text:
            log.info("fuzzy-match: %r → %r", text, canonical)
        return canonical

    # _BOT_MOTION_PATTERNS is built at module load from
    # Config/instruction.json::actions[*].bot_phrases (both English and
    # Arabic). The dispatcher reads it via the module-level reference.
    @property
    def _BOT_MOTION_PATTERNS(self):
        return _BOT_MOTION_PATTERNS

    def _dispatch_gemini_bot(self, text: str) -> None:
        """
        Dispatch motion when Gemini's spoken reply contains a known motion-
        confirmation pattern (English or Arabic). Strict gate: only fires
        if the current turn's user transcript already passed the wake-word
        check (`_wake_active_for_turn`) — so motion ALWAYS requires "Sanad"
        / "سند" somewhere in the user's request. Dedups against the
        user-transcript path so the same command can't fire twice.
        """
        if not text:
            return
        # SAFETY GATE: motion only when wake word was heard this turn.
        if not self._wake_active_for_turn:
            return

        low = text.strip().rstrip(".!?,").lower()
        # Note: we keep both English (lowercased) and Arabic patterns;
        # Arabic is unaffected by .lower() since it has no case.
        if not low:
            return

        canon = None
        for needle, cmd in self._BOT_MOTION_PATTERNS:
            # Match-anywhere so leading filler ("OK, " / "حسناً، ") doesn't
            # block the recognition. English needles are lowercase; Arabic
            # needles match as-is.
            if needle in low or needle in text:
                canon = cmd
                break
        if canon is None:
            return

        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini-bot): %s  (heard: %r)", canon, text[:80])
        _log_transcript("CMD-BOT", canon)
        if self._on_command:
            try:
                self._on_command(canon, "en")
            except Exception as e:
                log.error("on_command error: %s", e, exc_info=True)

    # ─── start / stop ─────────────────────────────────────

    def start(self):
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        self._thread = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def is_running(self) -> bool:
        """True while the voice loop thread is alive."""
        t = self._thread
        return bool(self._running and t is not None and t.is_alive())

    @property
    def is_speaking(self) -> bool:
        """Delegates to AudioAPI — True while TtsMaker is playing."""
        try:
            return bool(self._audio.is_speaking)
        except Exception:
            return False