Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
cold-load too slow on Jetson CPU)
Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
(wake_and_command / always_on / always_on_gated), hysteretic VAD,
pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
80 Hz HPF, 0.97 pre-emphasis, peak-normalize; sketch below), faster-whisper
base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
/s-/ phonetic wake-verify, full-turn debug WAV recording.
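A minimal sketch of that DSP chain (real code lives in Voice/marcus_voice.py;
the helper name, filter order, and scipy dependency are assumptions):

    import numpy as np
    from scipy.signal import butter, lfilter  # assumes scipy is installed

    def _preprocess(pcm: np.ndarray, sr: int = 16_000) -> np.ndarray:
        x = pcm.astype(np.float32)
        x -= x.mean()                                    # DC remove
        b, a = butter(2, 80.0, btype="highpass", fs=sr)  # 80 Hz HPF (order 2 is a guess)
        x = lfilter(b, a, x)
        x = np.append(x[0], x[1:] - 0.97 * x[:-1])       # 0.97 pre-emphasis
        peak = float(np.abs(x).max())
        return x / peak if peak > 0 else x               # peak-normalize to +/-1.0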
Config-driven vocab (zero hardcoded strings in Python; loader sketch below):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
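Illustrative read path for that vocab (the shipped matcher lives in
Voice/marcus_voice.py; difflib here stands in for whatever fuzzy matcher
is actually used):

    import difflib
    import json
    from typing import Optional

    _STT = json.load(open("Config/config_Voice.json"))["stt"]

    def canonicalise(text: str) -> Optional[str]:
        text = text.lower().strip()
        if len(text) < _STT["min_transcription_length"]:
            return None                                  # too short to trust
        hits = difflib.get_close_matches(
            text, _STT["command_vocab"], n=1,
            cutoff=_STT["command_vocab_cutoff"])
        return hits[0] if hits else None                 # None = reject the turn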
Command parser widened (Brain/command_parser.py; pattern sketch below):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
(move_map + step_duration_sec) via API/zmq_api.py; no more
hardcoded 0.3 / 2.0 magic numbers.
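Roughly what the widened patterns look like (word lists here are
illustrative, not the shipped regexes):

    import re

    _RE_SIMPLE_DIR = re.compile(
        r"^(?:(?:go|move|step|turn)\s+)?(left|right|forward|back(?:ward)?)$",
        re.IGNORECASE)
    _RE_STOP_SIMPLE = re.compile(
        r"^(stop|halt|wait|pause|freeze|hold)$", re.IGNORECASE)

    assert _RE_SIMPLE_DIR.match("go back") and _RE_SIMPLE_DIR.match("left")
    assert _RE_STOP_SIMPLE.match("freeze")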
API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern); resample
sketch below.
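Sketch of the resampling step (linear interpolation is an assumption about
how _play_pcm gets to 16 kHz):

    import numpy as np

    def _to_16k(pcm: np.ndarray, src_rate: int) -> np.ndarray:
        if src_rate == 16_000:
            return pcm
        n_out = int(len(pcm) * 16_000 / src_rate)
        t_out = np.linspace(0, len(pcm) - 1, n_out)
        return np.interp(t_out, np.arange(len(pcm)), pcm).astype(np.int16)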
Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json
Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, and the Doc/environment.md changelog.
Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec (capture sketch below), or Gemini Live via HTTP
microservice.
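Untested sketch of that parec capture path (the --device value is a
placeholder; real source names come from `pactl list sources short`):

    import subprocess

    proc = subprocess.Popen(
        ["parec", "--format=s16le", "--rate=16000", "--channels=1",
         "--device=<usb-mic-source>"],
        stdout=subprocess.PIPE)
    chunk = proc.stdout.read(3200)   # 100 ms of 16 kHz mono int16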
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#!/usr/bin/env python3
"""
Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper).

Energy-envelope state machine. Monitors raw PCM audio and fires a wake
event when it sees a short speech burst (sized to match a single spoken
word like "Sanad") followed by a clear silence.

Why this exists:
    Vosk's small English lexicon doesn't contain the word "sanad" and
    substitutes arbitrary English words ("us", "of", "senate"). Whisper on
    this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken
    for this specific hardware + wake word. An acoustic detector using
    only numpy doesn't care what the word actually is — it detects the
    *shape* of a single spoken word in the audio energy envelope.

Algorithm (state machine):
    SILENCE  ──(rms > effective threshold)──────────> SPEAKING
    SPEAKING ──(rms < threshold for post_silence_s)──> ANALYZE
    ANALYZE:  if 0.2 s < speech_duration < 1.5 s → fire WAKE
              else → reset to SILENCE (too short = cough, too long = sentence)
    after fire → COOLDOWN for 1.5 s before next detection

What it does NOT do:
    - Does not identify which word was spoken (anything in the
      duration range triggers)
    - Does not transcribe follow-on commands (you type those at the
      terminal)
    - Does not protect against loud non-speech (clapping, door slam)

Usage:
    from Voice.wake_detector import WakeConfig, WakeDetector
    det = WakeDetector(WakeConfig(sample_rate=16000))
    while True:
        chunk = mic.read_chunk(1024)   # bytes of int16 PCM
        if det.process(chunk):
            print("Wake!")
"""

from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Optional

import numpy as np


@dataclass
class WakeConfig:
    sample_rate: int = 16_000
    # RMS (int16 units) FLOOR for "this chunk is speech". The effective
    # threshold is max(speech_threshold, ambient_baseline * adaptive_mult)
    # so this is only a minimum guarantee — the detector adapts upward
    # in noisy rooms but never below this floor.
    # G1 far-field mic at normal speaking distance has rms ~ 80-400 for
    # quiet speech, 400-1500 for clear speech. 80 catches quiet speech;
    # raise to 120-150 if fan/typing noise triggers false wakes.
    speech_threshold: float = 80.0
    # How long a burst of speech must last to count as a "word".
    min_word_duration_s: float = 0.20
    max_word_duration_s: float = 1.50
    # How long of continuous silence we need to consider the word ended.
    post_silence_s: float = 0.30
    # Minimum gap between two consecutive wake fires. Prevents a single
    # spoken word from triggering twice.
    cooldown_s: float = 1.50
    # RMS window size — we analyze this many ms of audio per step.
    chunk_ms: int = 50
    # Adaptive: how many *recent silent* chunks to average for the noise
    # floor, and the multiplier applied on top. effective_threshold =
    # max(speech_threshold, baseline * adaptive_mult).
    adaptive_window_n: int = 50    # ~2.5 s at 50 ms chunks
    adaptive_mult: float = 3.0
    # Periodic diagnostic log cadence (seconds). 0 disables.
    diag_log_sec: float = 3.0


class WakeDetector:
    """Streaming acoustic wake detector — no language model required."""

    STATE_SILENCE = "SILENCE"
    STATE_SPEAKING = "SPEAKING"

    def __init__(self, cfg: Optional[WakeConfig] = None):
        self.cfg = cfg or WakeConfig()
        self._chunk_samples = int(self.cfg.sample_rate * self.cfg.chunk_ms / 1000)
        self._min_speech = int(self.cfg.min_word_duration_s * self.cfg.sample_rate)
        self._max_speech = int(self.cfg.max_word_duration_s * self.cfg.sample_rate)
        self._post_silence = int(self.cfg.post_silence_s * self.cfg.sample_rate)

        self._state = self.STATE_SILENCE
        self._speech_start = 0       # sample index where current burst began
        self._silence_run = 0        # consecutive silent samples inside SPEAKING
        self._sample_cursor = 0      # running sample count since start
        self._cooldown_until = 0.0   # wall-clock time after which we can fire again

        # A small rolling buffer of leftover samples (when the caller's
        # chunks don't align with our internal analysis window).
        self._carry = np.zeros(0, dtype=np.int16)

        # Audio of the most-recent wake-triggering burst. Saved when the
        # detector fires so callers (marcus_voice) can run Whisper on it
        # and verify the word was actually "Sanad" rather than a cough.
        self._burst_samples: list = []    # accumulated during SPEAKING
        self._last_burst_audio: Optional[np.ndarray] = None

        # Adaptive noise floor (rolling mean of RMS during SILENCE chunks).
        self._baseline_buf = []      # last N silent-window RMS values
        self._baseline = 0.0         # current estimate
        self._peak_since_diag = 0.0  # max rms since last diag log
        self._last_diag = time.time()
        # Logger is optional — if the host app set up logging, use it.
        try:
            import logging
            self._log = logging.getLogger("wake_detector")
        except Exception:
            self._log = None

    # ── public API ────────────────────────────────────────────────

    def process(self, pcm_bytes: bytes) -> bool:
        """
        Feed int16 PCM bytes. Returns True once per spoken "word"
        (short speech burst followed by silence).
        """
        if not pcm_bytes:
            return False
        incoming = np.frombuffer(pcm_bytes, dtype=np.int16)
        samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming

        fired = False
        n = self._chunk_samples
        i = 0
        while i + n <= samples.size:
            window = samples[i:i + n]
            if self._step(window):
                fired = True
                # break — flush the rest on next call so we get one fire per word
                i += n
                break
            i += n
            self._sample_cursor += n

        # Keep whatever didn't fit in a full window for next call.
        self._carry = samples[i:].copy()
        return fired

    def reset(self) -> None:
        """Drop all state — call when resuming from a long pause."""
        self._state = self.STATE_SILENCE
        self._silence_run = 0
        self._carry = np.zeros(0, dtype=np.int16)
        self._burst_samples = []

    def get_last_burst(self) -> Optional[np.ndarray]:
        """
        Return the int16 PCM samples of the most-recent wake-triggering
        burst, or None if no wake has fired yet. Used by marcus_voice to
        verify the triggering word was actually 'Sanad' before proceeding.
        """
        return self._last_burst_audio

    # ── internal ──────────────────────────────────────────────────

    def _step(self, window: np.ndarray) -> bool:
        rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))

        # Effective threshold = max(config floor, adaptive baseline * mult)
        eff = self.cfg.speech_threshold
        if self._baseline > 0:
            eff = max(eff, self._baseline * self.cfg.adaptive_mult)
        is_speech = rms > eff

        # Track peak for diag. Log periodically so you can *see* what the
        # detector is hearing — invaluable when "not hearing me" happens.
        if rms > self._peak_since_diag:
            self._peak_since_diag = rms
        now = time.time()
        if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec:
            if self._log is not None:
                self._log.info(
                    "wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s",
                    self._peak_since_diag, self._baseline, eff, self._state,
                )
            self._peak_since_diag = 0.0
            self._last_diag = now

        if now < self._cooldown_until:
            return False    # silent during cooldown

        if self._state == self.STATE_SILENCE:
            # Learn the noise floor ONLY in silence — so speech bursts
            # don't pull the baseline up and lock us out of wake.
            if not is_speech:
                self._baseline_buf.append(rms)
                if len(self._baseline_buf) > self.cfg.adaptive_window_n:
                    self._baseline_buf.pop(0)
                if self._baseline_buf:
                    self._baseline = sum(self._baseline_buf) / len(self._baseline_buf)
            if is_speech:
                self._state = self.STATE_SPEAKING
                self._speech_start = self._sample_cursor
                self._silence_run = 0
                # Begin capturing the burst audio for later Whisper verify.
                self._burst_samples = [window.copy()]
            return False

        # STATE_SPEAKING
        # Accumulate every window (speech OR silence inside the burst)
        # so we capture the full word + trailing quiet.
        self._burst_samples.append(window.copy())

        if is_speech:
            self._silence_run = 0
            # Abort if the burst is longer than a single word — user is
            # just talking, not addressing the robot.
            if self._sample_cursor - self._speech_start > self._max_speech:
                self._state = self.STATE_SILENCE
                self._burst_samples = []
            return False

        # Silent window inside SPEAKING — accumulate.
        self._silence_run += window.size
        if self._silence_run >= self._post_silence:
            speech_len = (self._sample_cursor - self._silence_run) - self._speech_start
            self._state = self.STATE_SILENCE
            self._silence_run = 0
            if self._min_speech <= speech_len <= self._max_speech:
                # Snapshot burst audio for the caller's Whisper verify.
                self._last_burst_audio = (
                    np.concatenate(self._burst_samples)
                    if self._burst_samples else None
                )
                self._burst_samples = []
                self._cooldown_until = now + self.cfg.cooldown_s
                return True
        return False


# ── standalone test ─────────────────────────────────────────────

if __name__ == "__main__":
    import os
    import sys
    _HERE = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, os.path.dirname(_HERE))
    from Voice.builtin_mic import BuiltinMic

    print("WakeDetector standalone test — say 'Sanad' a few times.")
    print("(Ctrl-C to quit)\n")
    det = WakeDetector()
    mic = BuiltinMic()
    mic.start()
    try:
        while True:
            chunk = mic.read_chunk(1024)
            if det.process(chunk):
                print(f"  [WAKE] (t={time.strftime('%H:%M:%S')})")
    except KeyboardInterrupt:
        pass
    finally:
        mic.stop()