Marcus/Voice/marcus_voice.py
kassam 5d839d4f4e Voice: finalise on faster-whisper + energy wake, remove Vosk
Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
  cold-load too slow on Jetson CPU)

Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
  adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
  (wake_and_command / always_on / always_on_gated), hysteretic VAD,
  pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
  80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
  base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
  fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
  /s-/ phonetic wake-verify, full-turn debug WAV recording.

Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
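
Illustrative excerpt (key names as above; the values here are
hypothetical, not the shipped lists):
  "stt": {
    "wake_words": ["sanad", "sunad", "..."],
    "command_vocab": ["turn left", "stop", "..."],
    "garbage_patterns": ["thank you.", "..."],
    "min_transcription_length": 3,
    "command_vocab_cutoff": 0.72
  }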

Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
  ('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
  (move_map + step_duration_sec) via API/zmq_api.py; no more
  hardcoded 0.3 / 2.0 magic numbers.
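
Sketch of the widened fast-path shapes (hypothetical — the real
patterns live in Brain/command_parser.py and may differ):
  _RE_SIMPLE_DIR  ~ ^(?:go|move|step|turn)?\s*(left|right|forward|back)$
  _RE_STOP_SIMPLE ~ ^(?:stop|halt|wait|pause|freeze|hold)$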

API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).

Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json

Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, and the Doc/environment.md changelog.

Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 14:32:28 +04:00

#!/usr/bin/env python3
"""
Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT).
Pipeline:
G1 mic ─► custom wake detector (numpy, offline, instant)
TTS "Yes" (AudioAPI → G1 TtsMaker)
record command audio until silence
faster-whisper base.en int8 (CPU) ──► brain callback(text)
Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no
ML). STT runs only on the recorded command, not on every 2 s of mic input,
so the CPU cost is bounded by how often the user talks.
Why faster-whisper (CTranslate2) instead of openai-whisper:
The Jetson's torch-aarch64 build has a Categorical sampler bug that
produces NaN logits on low-SNR input, which is exactly what the G1
far-field mic captures. faster-whisper bypasses torch entirely and
runs the int8-quantized model through CTranslate2 — same quality as
Whisper base, no numerical instability, 3× faster on this hardware.
"""
from __future__ import annotations
import logging
import os
import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional
import numpy as np
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
handlers=[
RotatingFileHandler(
os.path.join(LOG_DIR, "voice.log"),
maxBytes=5_000_000, backupCount=3, encoding="utf-8",
),
],
)
log = logging.getLogger("marcus_voice")
# Module-level vocabulary containers. EMPTY on import — populated by
# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
# command_vocab, garbage_patterns}. Config is the single source of truth;
# there are no hardcoded string lists here anymore.
#
# If you import this module without running a VoiceModule() first, these
# stay empty → fuzzy-match is a no-op, wake detection rejects everything,
# garbage filter rejects nothing. That's by design: bad config = obvious
# broken behavior, not silently-drifting hardcoded defaults.
WAKE_WORDS: set = set()
COMMAND_VOCAB: list = []
GARBAGE_PATTERNS: set = set()
_MIN_TRANSCRIPTION_LENGTH: int = 3
def _has_wake_word(text: str) -> bool:
"""
True if the utterance contains any wake-word variant as a *whole word*
(word-boundary match, not substring — so "standard" doesn't trigger
off "sand").
"""
import re
low = text.lower()
for w in WAKE_WORDS:
if re.search(r'\b' + re.escape(w) + r'\b', low):
return True
return False
def _strip_wake_word_once(text: str) -> str:
"""Single pass of wake-word stripping. Use via _strip_wake_word()."""
import re
stripped = text.strip()
# Case 1: the entire utterance is just a wake word + optional
# trailing punctuation. Return empty string so caller can ack-only.
for w in WAKE_WORDS:
if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
return ""
# Case 2: "Sanad <command>" — require whitespace (or comma+ws) between
# wake word and command so "Sanad." doesn't swallow "." as a command.
for w in sorted(WAKE_WORDS, key=len, reverse=True):
m = re.match(
rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
text, re.IGNORECASE,
)
if m:
return m.group(1).strip(' ,.!?')
# Case 3: "<command> Sanad" — trailing wake word.
m = re.match(
rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
text, re.IGNORECASE,
)
if m:
return m.group(1).strip(' ,.!?')
return text
def _strip_wake_word(text: str) -> str:
"""
Remove the wake word from the start or end of text, iteratively,
so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse
to the actual command (or empty string if nothing else was said).
Examples:
"Sanad, turn left""turn left"
"Sanad turn left""turn left"
"turn left Sanad""turn left"
"Sanad."""
"Sanad"""
"Sanad. Sanad.""" (was leaving "Sanad" before)
"Sanad Sanad stop""stop" (recursive strip)
"""
# Iterate until stable — each pass peels off one wake word. Cap at
# a handful of iterations so a malicious/garbled input can't loop.
for _ in range(5):
stripped = _strip_wake_word_once(text)
if stripped == text:
return text
text = stripped
return text
def _closest_command(text: str, cutoff: float = 0.72) -> str:
"""
Map a Whisper transcription to the closest known command phrase.
Returns the canonical command if there's a close-enough match, else
returns the original text unchanged. Close = difflib SequenceMatcher
ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while
accepting common Whisper near-misses like "Turn right up" → "turn right"
or "What do you see?" → "what do you see").
Also handles the "transcription contains a command" case — if the
text has a command phrase as a substring (e.g. "Sanad, turn left"
from an echo), extract the command.
"""
from difflib import SequenceMatcher
low = text.lower().strip().rstrip(".!?,")
if not low:
return text
# Cheap substring win first — no fuzzy needed if the command is
# literally in the transcription.
for cmd in COMMAND_VOCAB:
if cmd in low:
return cmd
best_cmd = None
best_ratio = 0.0
for cmd in COMMAND_VOCAB:
r = SequenceMatcher(None, low, cmd).ratio()
if r > best_ratio:
best_ratio = r
best_cmd = cmd
if best_ratio >= cutoff:
return best_cmd
return text
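# Illustrative behaviour with a hypothetical vocab (real phrases come
# from config_Voice.json::stt.command_vocab):
#   COMMAND_VOCAB = ["turn left", "turn right", "stop"]
#   _closest_command("Turn right up.")  → "turn right"  (substring hit)
#   _closest_command("turn righd")      → "turn right"  (ratio 0.90 ≥ 0.72)
#   _closest_command("what's the time") → unchanged     (below cutoff)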
class VoiceModule:
def __init__(
self,
audio_api,
on_command: Optional[Callable] = None,
on_wake: Optional[Callable] = None,
):
self._audio = audio_api
self._on_command = on_command
self._on_wake = on_wake
self._config = load_config("Voice")
self._stt = self._config.get("stt", {})
self._messages = self._config.get("messages", {})
# Load all voice vocabulary from config — these are the only
# string lists the voice layer uses, and they come from
# config_Voice.json. If a key is missing, the list is empty and
# that feature silently degrades (fuzzy-match no-op, nothing
# rejected as garbage, no wake-word match) — NEVER crashes.
global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH
WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])}
COMMAND_VOCAB = list(self._stt.get("command_vocab", []))
GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
_MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns",
len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS))
# ── Custom wake detector ──
from Voice.wake_detector import WakeDetector, WakeConfig
wcfg = WakeConfig(
sample_rate = 16_000,
speech_threshold = float(self._stt.get("speech_threshold", 80.0)),
min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)),
max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)),
post_silence_s = float(self._stt.get("post_silence", 0.30)),
cooldown_s = float(self._stt.get("wake_cooldown", 1.50)),
chunk_ms = int( self._stt.get("wake_chunk_ms", 50)),
adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)),
adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)),
diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)),
)
self._detector = WakeDetector(wcfg)
# ── G1 mic ──
from Voice.builtin_mic import BuiltinMic
_mcfg = self._config.get("mic_udp", {})
self._mic_capture = BuiltinMic(
group = _mcfg.get("group", "239.168.123.161"),
port = _mcfg.get("port", 5555),
buf_max = _mcfg.get("buffer_max_bytes", 64000),
)
self._sample_rate = self._mic_capture.sample_rate
# ── global software mic gain ──
# Applied to every byte read from the mic, so wake detector, VAD,
# AND Whisper all see the boosted audio. One knob, uniform effect.
# G1 far-field mic benefits from 2.0-3.0 for normal speaking volume;
# above 4.0 you start clipping loud words.
self._mic_gain = float(self._stt.get("mic_gain", 1.0))
if self._mic_gain != 1.0:
log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain)
# ── faster-whisper (lazy-init on first wake) ──
self._fw = None
self._running = False
self._thread = None
self._cooldown_until = 0.0
log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)")
# ─── gain-applied mic read ────────────────────────────
def _read_mic_raw(self, num_bytes: int) -> bytes:
"""Raw mic read — no gain. Used by the wake detector whose
thresholds are calibrated against unamplified G1 ambient."""
return self._mic_capture.read_chunk(num_bytes)
def _read_mic_gained(self, num_bytes: int) -> bytes:
"""
Mic read with self._mic_gain applied. Used during command
recording so Whisper sees a louder, cleaner signal. NOT used
in the wake loop — amplifying ambient there pushes it over
the wake threshold and the detector can never find its
silent baseline.
"""
raw = self._mic_capture.read_chunk(num_bytes)
if not raw or self._mic_gain == 1.0:
return raw
arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain
return np.clip(arr, -32768, 32767).astype(np.int16).tobytes()
# ─── lazy faster-whisper init ─────────────────────────
def _get_fw(self):
"""Load faster-whisper on first use — startup saved for cold path."""
if self._fw is not None:
return self._fw
model = self._stt.get("whisper_model", "base.en")
device = self._stt.get("whisper_device", "cpu")
compute = self._stt.get("whisper_compute_type", "int8")
log.info(
"Loading faster-whisper: model=%s device=%s compute=%s",
model, device, compute,
)
try:
from faster_whisper import WhisperModel
self._fw = WhisperModel(model, device=device, compute_type=compute)
log.info("faster-whisper ready")
except Exception as e:
log.error("faster-whisper init failed: %s — voice will be wake-only", e)
self._fw = None
return self._fw
# ─── command recording ────────────────────────────────
def _record_command(self) -> np.ndarray:
"""
Record the user's command with a hysteretic, adaptive-baseline VAD.
Design (handles quiet, normal, and loud voices on the G1 mic):
1. Learn the ambient floor from the wake detector's rolling idle
baseline (capped; probing the mic here would measure the user's
own speech — see the comment below), then set the "silence" gate
to max(ambient * 2.5, floor). Eliminates the "my silence threshold
is higher than my user's speaking level" failure mode.
2. Two thresholds with hysteresis:
speech_entry — RMS required to count as "speech started"
silence_exit — RMS below which we count silence
(< speech_entry; prevents mid-word bail on
breaths and short consonant gaps).
3. Recording can only *end* after we've actually heard speech.
Pure silence just runs out to max_record_sec, then returns
empty (the caller plays "I didn't catch that" without
burning a Whisper call on noise).
4. After speech is seen, silence_budget accumulates only while
RMS stays below silence_exit. A single loud burst resets
it to zero — so natural "turn... left" pauses don't end the
recording.
"""
# ── config knobs (all overridable via config_Voice.json::stt) ─
speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0))
silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0))
silence_dur = float(self._stt.get("silence_duration_sec", 1.2))
max_dur = float(self._stt.get("max_record_sec", 8.0))
min_dur = float(self._stt.get("min_record_sec", 0.4))
ambient_probe_s = float(self._stt.get("ambient_probe_sec", 0.2))
ambient_mult = float(self._stt.get("ambient_mult", 2.5))
small_chunk_bytes = 1024
analysis_ms = 100
analysis_bytes = int(self._sample_rate * analysis_ms / 1000) * 2
# ── 1. Reuse the wake detector's baseline instead of probing
# the mic right now. The wake detector's _baseline is a rolling
# mean of idle-silence RMS values from the last few seconds.
#
# Why NOT probe at record-time: we arrive here right after TTS
# "Yes", and the user typically starts speaking within 200 ms
# of hearing the ack. A probe window sized to the ambient floor
# then measures the *user's speech* as "ambient" and sets
# speech_entry above the user's actual amplitude — causing the
# "no speech in 8.00s" failure mode observed in the wild.
#
# Cap the baseline at a sensible ceiling so a one-off loud
# transient during idle doesn't lock us out either.
probe_buf = bytearray() # no probe audio kept
ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0
ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0))
ambient_rms = min(ambient_rms, ambient_cap)
if ambient_rms > 0:
adaptive_exit = max(silence_exit_rms, ambient_rms * ambient_mult)
adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8)
else:
adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms
log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) "
"speech_entry=%.0f silence_exit=%.0f",
ambient_rms, ambient_cap, adaptive_entry, adaptive_exit)
# ── 2. main capture loop ──────────────────────────────────────
collected = bytearray(probe_buf) # probe_buf is empty (wake baseline
# replaced the record-time probe); kept so the byte accounting below holds
analysis_buf = bytearray()
silence_budget = 0.0
total_time = len(probe_buf) / 2 / self._sample_rate
speech_seen = False
peak_rms_seen = 0.0
# Byte offset into `collected` at which speech first crossed
# adaptive_entry. We trim pre-speech silence to this point (minus
# ~300 ms pre-roll) before returning. Keeping Whisper's input
# tight (speech + small tails) improves transcription accuracy
# by removing the ambient/HVAC portion that dilutes the mel
# features.
speech_start_byte: Optional[int] = None
preroll_bytes = int(self._sample_rate * 0.3) * 2 # 300 ms
wall_start = time.time()
while total_time < max_dur and (time.time() - wall_start) < max_dur + 2:
raw = self._read_mic_gained(small_chunk_bytes)
if not raw:
time.sleep(0.005)
continue
collected.extend(raw)
analysis_buf.extend(raw)
total_time += (len(raw) // 2) / self._sample_rate
while len(analysis_buf) >= analysis_bytes:
win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16)
del analysis_buf[:analysis_bytes]
rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2)))
peak_rms_seen = max(peak_rms_seen, rms)
if rms >= adaptive_entry:
if not speech_seen:
speech_seen = True
# Record where speech started (byte offset
# in `collected`) so we can trim pre-roll later.
speech_start_byte = max(0, len(collected) - preroll_bytes)
silence_budget = 0.0
elif speech_seen and rms < adaptive_exit:
silence_budget += analysis_ms / 1000.0
# between exit and entry → hold state (hysteresis zone)
# end only *after* we've heard real speech
if (speech_seen
and silence_budget >= silence_dur
and total_time >= min_dur):
log.info("silence after speech at %.2fs (peak_rms=%.0f)",
total_time, peak_rms_seen)
break
if not speech_seen:
log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping",
total_time, peak_rms_seen, adaptive_entry)
return np.array([], dtype=np.int16)
if total_time >= max_dur:
log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)",
total_time, peak_rms_seen)
# Trim leading pre-speech silence. Keep 300 ms of pre-roll so
# the onset of the first phoneme is preserved for Whisper.
if speech_start_byte and speech_start_byte > 0:
trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000
log.info("trimmed %.0f ms of leading silence "
"(pre-speech buffer %d bytes)",
trimmed_ms, speech_start_byte)
collected = collected[speech_start_byte:]
return (np.frombuffer(bytes(collected), dtype=np.int16)
if collected else np.array([], dtype=np.int16))
# ─── transcription ────────────────────────────────────
def _transcribe(self, audio_i16: np.ndarray) -> str:
"""int16 PCM → Whisper transcription. Returns '' on no-speech/noise."""
fw = self._get_fw()
if fw is None:
return ""
# mic_gain was already applied in _read_mic_gained() during
# _record_command, so audio_i16 here is already boosted.
# int16 → float32 [-1, 1] + DSP pre-processing:
# 1. DC offset removal (subtract mean) — removes any mic bias
# 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise,
# and speaker-vibration resonance. Whisper ignores the
# rumble band anyway, but it inflates RMS estimation and
# steals dynamic range from the speech band.
# 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost
# that sharpens consonants (/t/, /s/, /k/ plosives/fricatives)
# which Whisper's mel features care most about.
# 4. Peak-normalize to 0.7.
audio_f32 = audio_i16.astype(np.float32) / 32768.0
# 1. DC removal
audio_f32 = audio_f32 - np.mean(audio_f32)
# 2. High-pass at 80 Hz (1-pole IIR, stable + cheap)
audio_f32 = self._highpass_80hz(audio_f32)
# 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1]
audio_f32 = np.append(
audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1]
)
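# Toy example of the pre-emphasis step (hypothetical samples):
#   x = [0.10, 0.30, 0.20]
#   y = [0.10, 0.30 - 0.97*0.10, 0.20 - 0.97*0.30] = [0.10, 0.203, -0.091]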
# 4. Peak-normalize
peak = float(np.abs(audio_f32).max())
if peak > 1e-4 and peak < 0.7:
boost = 0.7 / peak
audio_f32 = audio_f32 * boost
log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak)
# Initial prompt biases the model toward our command vocabulary.
# Whisper uses this as decoder context — words in the prompt become
# more likely, which converts ambiguous low-SNR audio like "muv rahh"
# from a plausible English phrase ("and provide") into the intended
# command ("move right"). Keep short — long prompts can be echoed.
init_prompt = self._stt.get(
"whisper_initial_prompt",
"turn left, turn right, move forward, walk back, stop, come here, "
"sit down, stand up, raise arm, wave, look around, what do you see, "
"remember this, go home, patrol."
)
beam_size = int(self._stt.get("whisper_beam_size", 5))
no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6))
log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0))
compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4))
# Temperature fallback: greedy first (T=0), then 0.2, then 0.4.
# Whisper retries automatically when a pass is rejected by
# its confidence gates (log_prob < threshold etc.). On noisy
# audio this commonly rescues a bad greedy decode.
temperatures = self._stt.get(
"whisper_temperature_fallback", [0.0, 0.2, 0.4]
)
try:
segments, info = fw.transcribe(
audio_f32,
language="en",
beam_size=beam_size, # 5 = much better than greedy on noisy audio
temperature=temperatures, # greedy → 0.2 → 0.4 fallback
initial_prompt=init_prompt, # command-vocabulary bias (default set above)
condition_on_previous_text=False,
vad_filter=False, # we already trimmed silence
without_timestamps=True,
# Whisper's built-in gates — drop transcripts that look
# like hallucinations (very low prob, highly compressed).
no_speech_threshold=no_speech_threshold,
log_prob_threshold=log_prob_threshold,
compression_ratio_threshold=compression_ratio_t,
)
# Collect segments and their mean log-prob for a confidence signal.
seg_list = list(segments)
text = " ".join(s.text for s in seg_list).strip()
nsp = float(getattr(info, "no_speech_prob", 0.0))
if seg_list:
mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
log.info("whisper: lp=%.2f nsp=%.2f text=%r",
mean_lp, nsp, text[:80])
else:
# CRITICAL: log even when Whisper returned zero segments
# so we can see WHY it dropped everything. Usually nsp is
# above the threshold or the log-prob fallback killed it.
log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop",
nsp, no_speech_threshold, log_prob_threshold)
except Exception as e:
log.error("faster-whisper transcribe failed: %s", e)
return ""
if not text:
return ""
# Reject Whisper garbage patterns (stt.garbage_patterns) and
# transcriptions shorter than stt.min_transcription_length.
# Preserve:
# - bare wake words (valid "just Sanad" signal → ack)
# - exact matches in stt.command_vocab (legitimate short
# commands like "go", "hi" must survive the length filter)
low = text.lower().strip().rstrip(".!?,")
vocab_exact = {c.lower() for c in COMMAND_VOCAB}
if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
if low not in WAKE_WORDS and low not in vocab_exact:
log.info("Rejecting likely noise transcription: %r", text)
return ""
# NOTE: fuzzy-match to canonical command phrase used to happen
# here, but it runs BEFORE gated-mode could see the wake word.
# Moved to _normalize_command() and called at dispatch time
# AFTER the wake-word gate + wake-word strip, so the gate
# always sees the raw Whisper text.
return text
@staticmethod
def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray:
"""
1-pole IIR high-pass at ~80 Hz. Attenuates HVAC/fan rumble
without touching the speech band. Cheap: 2 multiplies per sample.
"""
if x.size < 2:
return x
# Alpha from fc=80Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc)
import math
rc = 1.0 / (2 * math.pi * 80.0)
dt = 1.0 / sr
alpha = rc / (rc + dt)
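# Numerically, at sr=16 kHz: RC ≈ 1.989e-3 s, dt = 6.25e-5 s,
# giving alpha ≈ 0.9695.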
y = np.empty_like(x)
y[0] = x[0]
# Plain-Python loop, not vectorised — still cheap for the short
# command clips this sees (a few seconds ≈ tens of kilosamples).
prev_y, prev_x = x[0], x[0]
for i in range(1, x.size):
cur = alpha * (prev_y + x[i] - prev_x)
y[i] = cur
prev_y, prev_x = cur, x[i]
return y
def _transcribe_raw(self, audio_i16: np.ndarray) -> str:
"""
Like _transcribe but WITHOUT the garbage-pattern / length filters
and without the `initial_prompt` bias. Used for wake verify, where:
- We only care about the first phoneme (s/sh/z) — a 2-char "so"
is a valid /sa-/ signature and MUST NOT be dropped by
min_transcription_length.
- A biased initial_prompt makes Whisper echo itself on unclear
audio ("This is a robot assistant" → not s-starting → reject).
The downside (no Sanad nudge) is fine here because the acoustic
detector has already gated out non-speech.
"""
fw = self._get_fw()
if fw is None:
return ""
if self._mic_gain != 1.0:
audio_i16 = np.clip(
audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767
).astype(np.int16)
audio_f32 = audio_i16.astype(np.float32) / 32768.0
peak = float(np.abs(audio_f32).max())
if peak > 1e-4 and peak < 0.7:
audio_f32 = audio_f32 * (0.7 / peak)
try:
segments, info = fw.transcribe(
audio_f32,
language="en",
beam_size=int(self._stt.get("whisper_beam_size", 5)),
temperature=0.0,
initial_prompt="", # NO bias → NO prompt echo
condition_on_previous_text=False,
vad_filter=False,
without_timestamps=True,
# Looser gates — we're about to do phonetic match,
# not trust the transcription verbatim.
no_speech_threshold=0.85,
log_prob_threshold=-1.8,
compression_ratio_threshold=3.0,
)
seg_list = list(segments)
text = " ".join(s.text for s in seg_list).strip()
if seg_list:
mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r",
mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80])
return text
except Exception as e:
log.error("whisper-raw transcribe failed: %s", e)
return ""
# ─── command transcription ────────────────────────────
def _transcribe_command(self, audio_i16: np.ndarray) -> str:
"""
Decode the recorded command audio with faster-whisper. Thin
wrapper over self._transcribe(); exists so _handle_wake and the
always-on loop share one entry point.
"""
if audio_i16.size == 0:
return ""
return self._transcribe(audio_i16)
def _save_turn_wav(
self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd",
) -> Optional[str]:
"""
Save a single-turn command recording for debugging.
Filename: {tag}_{epoch}_{sanitised_transcription}.wav
Examples:
cmd_1728562000_turn_right.wav ← successful command
cmd_1728562030_hi.wav ← Whisper misheard as 'Hi'
unk_1728562045_.wav ← Whisper returned empty
cmd_1728562060_thanks_for_watch.wav ← garbage-filtered
Rotation: keeps the most recent 50 across all tags so the disk
doesn't fill up during a long session. Tunable via
stt.recording_keep_count.
"""
try:
import re as _re
import wave
rec_dir = os.path.join(
PROJECT_ROOT,
self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
)
os.makedirs(rec_dir, exist_ok=True)
# Rotate — keep only the most recent N across all command WAVs.
keep = int(self._stt.get("recording_keep_count", 50))
existing = sorted(
f for f in os.listdir(rec_dir)
if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav")
)
for old in existing[:max(0, len(existing) - keep + 1)]:
try: os.remove(os.path.join(rec_dir, old))
except Exception: pass
# Sanitise transcription for filename: lowercase, alnum + _, <=40 chars
slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40]
path = os.path.join(
rec_dir, f"{tag}_{int(time.time())}_{slug}.wav"
)
with wave.open(path, "wb") as w:
w.setnchannels(1)
w.setsampwidth(2)
w.setframerate(self._sample_rate)
w.writeframes(audio_i16.astype(np.int16).tobytes())
return path
except Exception as e:
log.warning("failed to save turn wav: %s", e)
return None
def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]:
"""Backward-compat wrapper — save with the `unk` tag."""
return self._save_turn_wav(audio_i16, transcription="", tag="unk")
# ─── command normalization (post-gate) ────────────────
def _normalize_command(self, text: str) -> str:
"""
Apply fuzzy-match to the closest canonical command phrase.
Call AFTER the gated wake check so the wake word has already
been stripped by the caller if appropriate. Turns near-misses
like "Turn right up""turn right" so command_parser.py's
regex fast-path can hit them without an LLM round-trip.
"""
canonical = _closest_command(text, cutoff=self._vocab_cutoff)
if canonical != text:
log.info("fuzzy-match: %r%r", text, canonical)
return canonical
# ─── main loop ────────────────────────────────────────
def _voice_loop(self):
"""
Dispatch to the right loop based on stt.mode:
"wake_and_command" — require "Sanad" wake word (acoustic), then
record and transcribe a command.
"always_on" — Transcribe every utterance, log all, and
dispatch all to the brain. No wake.
"always_on_gated" — Transcribe every utterance and log all,
but ONLY dispatch utterances that contain
"Sanad" (fuzzy). Wake word is stripped
before the command is sent to the brain.
"""
mode = self._stt.get("mode", "wake_and_command").lower()
self._mic_capture.start()
if mode in ("always_on", "always_on_gated"):
self._voice_loop_always_on(gated=(mode == "always_on_gated"))
else:
self._voice_loop_wake()
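# Hypothetical Config/config_Voice.json excerpt selecting the loop mode
# (only stt.mode is read here; any other value falls back to wake mode):
#   "stt": { "mode": "always_on_gated", ... }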
def _voice_loop_wake(self):
"""Classic wake-and-command: listen for 'Sanad', then record command."""
log.info("Voice loop started — listening for wake (energy-based)")
was_speaking = False
while self._running:
try:
if self._audio.is_speaking:
was_speaking = True
time.sleep(0.1)
self._detector.reset()
continue
if was_speaking:
time.sleep(0.25)
self._mic_capture.flush()
self._detector.reset()
was_speaking = False
if time.time() < self._cooldown_until:
_ = self._read_mic_raw(1024)
self._detector.reset()
time.sleep(0.05)
continue
chunk = self._read_mic_raw(1024)
if not chunk:
continue
if self._detector.process(chunk):
self._handle_wake()
except Exception as e:
log.error("Voice loop error: %s", e, exc_info=True)
time.sleep(1)
def _voice_loop_always_on(self, gated: bool = False):
"""
Always-on mode — Sanad-style continuous listening.
If `gated` is True, utterances that don't contain the wake word
"Sanad" (or a fuzzy variant) are logged but NOT dispatched to the
brain — the robot hears everything, speaks only when addressed.
Architecture (no wake word, no ack TTS):
1. Continuously read the gained mic stream in 32 ms chunks.
2. Run a hysteretic VAD on the stream — speech_entry_rms
starts an utterance, silence_exit_rms + silence_duration
ends one.
3. On each utterance end → Whisper transcribe → fuzzy-match
→ dispatch to brain.
4. Every ~5 s of idle: log an `ambient: rms=... peak=...` line
so you can SEE what the mic is doing at all times, even
when nobody's talking. Matches Sanad's "always listening"
visibility.
5. No upper amplitude gate — everything above the entry
threshold is captured, quiet or loud. Loud speech clips
naturally against int16; Whisper handles it.
Thresholds come from the same stt.* config as wake mode but are
typically tuned lower here (you want eager capture since there's
no wake-word gate to prevent false positives).
"""
log.info(
"Voice loop started — ALWAYS-ON mode%s",
" [gated: only 'Sanad' utterances dispatched]" if gated
else " (no wake word — every utterance dispatched)"
)
speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0))
silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0))
silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8))
min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3))
max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0))
idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0))
ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4))
ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100))
buffer = bytearray()
in_speech = False
silence_budget = 0.0
speech_duration = 0.0
peak_rms = 0.0
idle_peak_rms = 0.0
idle_sum_rms = 0.0
idle_chunks = 0
last_idle_log = time.time()
was_speaking_tts = False
# Rolling ambient (idle-only) RMS buffer. Used to adapt silence_exit
# so a noisy room doesn't trap the VAD at max_utter_s: if the
# observed idle floor sits at rms=200, silence_exit needs to be
# above 200 or silence never accumulates. We take
# effective_exit = max(config_silence_exit, ambient_floor * mult).
ambient_buf: list = []
ambient_floor = 0.0
# Seed ambient_floor by sampling ~1s of mic BEFORE entering the
# loop. Without this, the very first utterance runs with
# ambient_floor=0 → eff_exit=config_floor, which under-cuts
# noisy rooms and creates self-sustaining echo loops.
seed_chunks = []
seed_deadline = time.time() + 1.0
while time.time() < seed_deadline:
r = self._read_mic_gained(1024)
if r:
a = np.frombuffer(r, dtype=np.int16)
if a.size:
seed_chunks.append(
float(np.sqrt(np.mean(a.astype(np.float64) ** 2)))
)
else:
time.sleep(0.005)
if seed_chunks:
# Use the median so one loud transient doesn't poison the seed.
seed_chunks.sort()
ambient_floor = seed_chunks[len(seed_chunks) // 2]
ambient_buf = list(seed_chunks[-ambient_win:])
log.info("ambient seeded: floor=%.0f from %d chunks",
ambient_floor, len(seed_chunks))
while self._running:
try:
# Drop mic input while the robot itself is speaking so we
# don't feed our own TTS back through Whisper.
if self._audio.is_speaking:
was_speaking_tts = True
buffer.clear()
in_speech = False
silence_budget = 0.0
speech_duration = 0.0
peak_rms = 0.0
time.sleep(0.1)
continue
if was_speaking_tts:
time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3)))
self._mic_capture.flush()
was_speaking_tts = False
raw = self._read_mic_gained(1024)
if not raw:
time.sleep(0.005)
continue
arr = np.frombuffer(raw, dtype=np.int16)
rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2)))
chunk_s = (len(raw) // 2) / self._sample_rate
if in_speech:
buffer.extend(raw)
speech_duration += chunk_s
peak_rms = max(peak_rms, rms)
# Adaptive silence exit: sits max(config_floor,
# ambient_floor × mult). Prevents the "room is noisier
# than silence_exit" failure mode where silence never
# accumulates and every utterance hits max_utter_s.
eff_exit = max(silence_exit, ambient_floor * ambient_mult)
if rms < eff_exit:
silence_budget += chunk_s
else:
silence_budget = 0.0
utter_over = (silence_budget >= silence_dur and
speech_duration >= min_utter_s)
force_stop = speech_duration >= max_utter_s
if utter_over or force_stop:
reason = "max-duration" if force_stop else "silence"
audio = np.frombuffer(bytes(buffer), dtype=np.int16)
log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d",
reason, speech_duration, peak_rms, audio.size)
# RESET STATE IMMEDIATELY — before any Whisper /
# speak() / dispatch. Previously a `continue` from
# the wake-only ack branch skipped the reset, and
# the 12-second buffer lived forever, re-transcribed
# every iteration into the same "Sanad" output,
# spawning a self-sustaining "Yes" loop.
buffer.clear()
in_speech = False
silence_budget = 0.0
speech_duration = 0.0
peak_rms = 0.0
text = self._transcribe_command(audio) if audio.size else ""
if text:
log.info("HEARD: %r", text)
# Gated mode: only dispatch if the wake word was
# spoken. Everything is still logged above so the
# operator has full visibility into what the mic
# is picking up.
if gated and not _has_wake_word(text):
log.info(" (no wake word — not dispatched)")
else:
if gated:
command = _strip_wake_word(text)
if command != text:
log.info(" wake-stripped: %r%r",
text, command)
# Bare wake word ("Sanad.", "Sanad") →
# speak a "Yes" ack, do NOT call the
# brain (it would hallucinate a random
# response from a 1-word prompt).
if not command:
log.info(" wake-only utterance — speaking ack")
try:
self._audio.speak(
self._messages.get("wake_heard", "Yes")
)
except Exception as e:
log.warning("wake-ack TTS failed: %s", e)
continue
else:
command = text
# Normalize near-misses ("Turn right up" →
# "turn right") so command_parser's regex
# fast-path can hit without an LLM round-trip.
command = self._normalize_command(command)
print(f' [Sanad] heard: "{command}"')
if self._on_command:
try:
self._on_command(command, "en")
except Exception as e:
log.error("on_command: %s", e, exc_info=True)
else:
log.info("utterance rejected (empty/garbage after Whisper)")
else:
idle_peak_rms = max(idle_peak_rms, rms)
idle_sum_rms += rms
idle_chunks += 1
# Maintain the rolling ambient floor so silence_exit can
# adapt. Use windows that are *clearly* not speech
# (rms < speech_entry / 2) — otherwise a borderline
# window just before transition pollutes the floor.
if rms < speech_entry * 0.5:
ambient_buf.append(rms)
if len(ambient_buf) > ambient_win:
ambient_buf.pop(0)
if ambient_buf:
ambient_floor = sum(ambient_buf) / len(ambient_buf)
if rms >= speech_entry:
# utterance starts — keep this chunk as pre-roll
log.info("utterance start (rms=%.0f >= entry=%.0f)",
rms, speech_entry)
buffer.extend(raw)
in_speech = True
speech_duration = chunk_s
peak_rms = rms
silence_budget = 0.0
# periodic ambient log while idle — "I am listening"
now = time.time()
if (now - last_idle_log) >= idle_log_s and idle_chunks > 0:
eff_exit = max(silence_exit, ambient_floor * ambient_mult)
log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d "
"floor=%.0f entry=%.0f eff_exit=%.0f",
idle_sum_rms / idle_chunks, idle_peak_rms,
idle_chunks, ambient_floor, speech_entry, eff_exit)
idle_peak_rms = 0.0
idle_sum_rms = 0.0
idle_chunks = 0
last_idle_log = now
except Exception as e:
log.error("Always-on voice loop error: %s", e, exc_info=True)
time.sleep(1)
def _handle_wake(self):
t_wake = time.time()
log.info("Wake detected (acoustic)")
# Verify the burst that triggered wake actually sounds like a
# wake word. The acoustic detector fires on ANY 0.2-1.5s burst
# (coughs, claps, door slams). We run a lightweight Whisper
# decode on the burst and accept if EITHER:
# (a) a wake-word variant is in the transcription, OR
# (b) the transcription starts with 's'/'sh'/'z' — Whisper's
# consistent signature for mishearing non-English "Sanad"
# as an English /sa-/ word ("Stop", "Set", "Sand", "Send").
# Reject if Whisper returns empty (pure noise / cough) or
# confidently not-s speech ("hello", "okay").
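# Examples (hypothetical Whisper outputs for a trigger burst):
#   "Sanad." → accept (wake-word match)   "Stop." → accept (s-phonetic)
#   "Hello." → reject (speech, not /s-/)  ""      → reject (noise burst)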
if self._stt.get("wake_verify_enabled", True):
burst = self._detector.get_last_burst()
if burst is not None and burst.size >= int(0.15 * self._sample_rate):
t_verify = time.time()
# Lenient transcribe — no garbage filter, no min-length,
# no bias prompt. See _transcribe_raw docstring.
heard = self._transcribe_raw(burst)
verify_ms = (time.time() - t_verify) * 1000
low = (heard or "").lower().strip().lstrip('"\'.,!?')
if not low:
log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms)
return
starts_with_s = low.startswith(("s", "sh", "z"))
if _has_wake_word(heard):
log.info(" wake verified (wake-word: %r, %.0fms)",
heard, verify_ms)
elif starts_with_s:
log.info(" wake verified (s-phonetic: %r, %.0fms)",
heard, verify_ms)
else:
log.info(" wake REJECTED — %r (%.0fms, not s-starting)",
heard, verify_ms)
return
print("\n [Sanad] wake heard — listening…")
ack_mode = self._stt.get("wake_ack", "tts").lower()
if ack_mode == "none":
log.info(" wake-ack: silent (no TTS)")
else:
try:
self._audio.speak(self._messages.get("wake_heard", "Yes"))
except Exception as e:
log.warning("TTS ack failed: %s", e)
# Wait for ack TTS + speaker reverberation to decay
while self._audio.is_speaking:
time.sleep(0.05)
settle = float(self._stt.get("post_tts_settle_sec", 0.3))
time.sleep(settle)
self._mic_capture.flush()
log.info(" wake→record-ready: %.2fs", time.time() - t_wake)
log.info("Recording command...")
audio = self._record_command()
# _record_command returns empty if it never saw speech above the
# adaptive entry threshold — no point running STT on noise.
# Two cases:
# audio.size == 0 → no speech at all (likely false wake
# from cough/slam). SILENTLY reset —
# don't blurt "I didn't catch that" on
# what was never a real interaction.
# 0 < size < 8000 → brief speech burst (< 0.5s). Probably
# a real-but-unintelligible attempt;
# speak "I didn't catch that" so the
# user knows to retry.
if audio.size == 0:
log.info("Command dropped (no speech — likely false wake); silent reset")
self._cooldown_until = time.time() + float(
self._stt.get("command_cooldown_sec", 1.5))
return
if audio.size < 8000: # < 0.5 s but > 0 — real short attempt
log.info("Command too short (%.2fs); asking user to repeat",
audio.size / self._sample_rate)
try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that"))
except Exception: pass
self._cooldown_until = time.time() + float(
self._stt.get("command_cooldown_sec", 1.5))
return
peak = int(np.abs(audio).max())
rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2)))
log.info("command audio: samples=%d peak=%d rms=%.1f",
audio.size, peak, rms)
text = self._transcribe_command(audio)
if not text:
log.info("Empty or rejected transcription")
# Save WAV of the failed transcription for post-mortem.
if self._stt.get("recording_enabled", True):
self._save_turn_wav(audio, transcription="", tag="unk")
try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that"))
except Exception: pass
self._cooldown_until = time.time() + float(
self._stt.get("command_cooldown_sec", 1.5))
return
# Normalize near-miss transcriptions like "Turn right up" → "turn
# right" so the brain's regex fast-path catches them.
text = self._normalize_command(text)
log.info("Transcribed: %s", text[:120])
# Save every successful command recording so you can listen back
# later and see what the mic actually heard vs what Whisper
# transcribed. Disable with stt.recording_enabled=false.
if self._stt.get("recording_enabled", True):
wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd")
if wav_path:
log.info("saved: %s", os.path.basename(wav_path))
if self._on_command:
try:
self._on_command(text, "en")
except Exception as e:
log.error("on_command error: %s", e, exc_info=True)
elif self._on_wake:
try: self._on_wake()
except Exception: pass
cd = float(self._stt.get("command_cooldown_sec", 1.5))
self._cooldown_until = time.time() + cd
log.info("wake→dispatch total: %.2fs | cooldown %.1fs",
time.time() - t_wake, cd)
# ─── start / stop ─────────────────────────────────────
def start(self):
if self._running:
log.warning("VoiceModule already running")
return
self._running = True
self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
self._thread.start()
log.info("Voice module started")
def stop(self):
self._running = False
try: self._mic_capture.stop()
except Exception: pass
if self._thread:
self._thread.join(timeout=5)
self._thread = None
log.info("Voice module stopped")
@property
def is_running(self) -> bool:
return self._running
if __name__ == "__main__":
from API.audio_api import AudioAPI
def on_cmd(text, lang):
print(f"\n COMMAND [{lang}]: {text}\n")
audio = AudioAPI()
voice = VoiceModule(audio, on_command=on_cmd)
print('Starting. Say "Sanad", then speak your command.\n')
voice.start()
try:
while voice.is_running: time.sleep(0.5)
except KeyboardInterrupt:
voice.stop()