# Marcus/Voice/wake_detector.py

#!/usr/bin/env python3
"""
Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper).

Energy-envelope state machine. Monitors raw PCM audio and fires a wake
event when it sees a short speech burst (sized to match a single spoken
word like "Sanad") followed by a clear silence.

Why this exists:
    Vosk's small English lexicon doesn't contain the word "sanad" and
    substitutes arbitrary English words ("us", "of", "senate"). Whisper on
    this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken
    for this specific hardware + wake word. An acoustic detector using
    only numpy doesn't care what the word actually is — it detects the
    *shape* of a single spoken word in the audio energy envelope.

Algorithm (state machine):
    SILENCE ──(rms > speech_threshold)──> SPEAKING
    SPEAKING ──(rms ≤ speech_threshold for N chunks)──> ANALYZE
    ANALYZE: if 0.2 s < speech_duration < 1.5 s → fire WAKE
            else → reset to SILENCE (too short = cough, too long = sentence)
    after fire → COOLDOWN for 1.5 s before next detection

What it does NOT do:
    - Does not identify which word was spoken (anything in the
      duration range triggers)
    - Does not transcribe follow-on commands (you type those at the
      terminal)
    - Does not protect against loud non-speech (clapping, door slam)

Usage:
    from Voice.wake_detector import WakeDetector
    det = WakeDetector(sample_rate=16000)
    while True:
        chunk = mic.read_chunk(1024)        # bytes of int16 PCM
        if det.process(chunk):
            print("Wake!")
"""

from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Optional

import numpy as np


@dataclass
class WakeConfig:
    """Tunable parameters for the energy-envelope wake detector."""

    # Sampling rate of the incoming PCM stream, in samples per second.
    sample_rate: int = 16_000

    # A window whose RMS (in int16 units) exceeds this value is treated as
    # speech. On the G1 on-board mic at normal speaking distance, speech
    # sits around rms ≈ 500-1500 and silence around ≈ 40-100, so 150 is a
    # safe middle ground.
    speech_threshold: float = 150.0

    # Shortest and longest speech burst (seconds) accepted as one "word".
    min_word_duration_s: float = 0.20
    max_word_duration_s: float = 1.50

    # Amount of uninterrupted silence (seconds) that marks the end of a word.
    post_silence_s: float = 0.30

    # Minimum spacing (seconds) between two wake fires, so that a single
    # spoken word cannot trigger twice.
    cooldown_s: float = 1.50

    # Length of one RMS analysis window, in milliseconds.
    chunk_ms: int = 50


class WakeDetector:
    """Streaming acoustic wake detector — no language model required.

    Audio is analysed in fixed-size windows of ``cfg.chunk_ms`` milliseconds.
    A window counts as speech when its RMS exceeds ``cfg.speech_threshold``.
    A burst of speech that is followed by ``cfg.post_silence_s`` of silence
    fires a wake event if the burst lasted between
    ``cfg.min_word_duration_s`` and ``cfg.max_word_duration_s`` (inclusive).
    After a fire, further detection is suppressed for ``cfg.cooldown_s``
    seconds of monotonic clock time.
    """

    STATE_SILENCE  = "SILENCE"
    STATE_SPEAKING = "SPEAKING"

    def __init__(self, cfg: Optional["WakeConfig"] = None):
        """
        Build a detector.

        Args:
            cfg: detector parameters; a default ``WakeConfig()`` is used
                when omitted.

        Raises:
            ValueError: if ``sample_rate`` and ``chunk_ms`` yield an analysis
                window of less than one sample (which would otherwise make
                ``process`` loop forever).
        """
        self.cfg = cfg if cfg is not None else WakeConfig()
        rate = self.cfg.sample_rate

        self._chunk_samples = int(rate * self.cfg.chunk_ms / 1000)
        if self._chunk_samples < 1:
            raise ValueError(
                "sample_rate and chunk_ms must give an analysis window of "
                "at least one sample"
            )

        # Durations converted to sample counts. round() rather than plain
        # truncation so float error (e.g. 0.29 * rate landing just below an
        # integer) cannot shave a sample off a threshold.
        self._min_speech    = int(round(self.cfg.min_word_duration_s * rate))
        self._max_speech    = int(round(self.cfg.max_word_duration_s * rate))
        self._post_silence  = int(round(self.cfg.post_silence_s      * rate))

        self._state         = self.STATE_SILENCE
        self._speech_start  = 0        # sample index where current burst began
        self._silence_run   = 0        # consecutive silent samples inside SPEAKING
        self._sample_cursor = 0        # samples fully analysed so far
        self._cooldown_until = 0.0     # time.monotonic() value after which we can fire again

        # A small rolling buffer of leftover samples (when the caller's
        # chunks don't align with our internal analysis window).
        self._carry = np.zeros(0, dtype=np.int16)

    # ── public API ────────────────────────────────────────────────

    def process(self, pcm_bytes: bytes) -> bool:
        """
        Feed int16 PCM bytes. Returns True once per spoken "word"
        (short speech burst followed by silence).

        Args:
            pcm_bytes: raw int16 samples; may be any length, leftovers that
                do not fill an analysis window are kept for the next call.

        Returns:
            True if a wake event fired during this call, else False. At most
            one event is reported per call: analysis stops at the firing
            window and the remaining samples are handled on the next call.

        Raises:
            ValueError: propagated from ``np.frombuffer`` when the byte
                count is not a multiple of the int16 item size.
        """
        if not pcm_bytes:
            return False
        incoming = np.frombuffer(pcm_bytes, dtype=np.int16)
        samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming

        fired = False
        n = self._chunk_samples
        i = 0
        while i + n <= samples.size:
            hit = self._step(samples[i:i + n])
            # Advance the cursor for every analysed window, including the
            # one that fired, so the running sample count never drifts.
            i += n
            self._sample_cursor += n
            if hit:
                fired = True
                break

        # Keep whatever was not analysed for the next call.
        self._carry = samples[i:].copy()
        return fired

    def reset(self) -> None:
        """Drop all state — call when resuming from a long pause.

        Returns the state machine to SILENCE and discards buffered samples.
        The running sample counter and any pending cooldown deadline are
        left as they are.
        """
        self._state = self.STATE_SILENCE
        self._silence_run = 0
        self._carry = np.zeros(0, dtype=np.int16)

    # ── internal ──────────────────────────────────────────────────

    def _step(self, window: np.ndarray) -> bool:
        """
        Advance the state machine by one analysis window.

        ``self._sample_cursor`` must hold the index of the first sample of
        ``window``; the caller advances it after this method returns.

        Returns:
            True if this window completed a valid word and a wake fired.
        """
        # Monotonic clock: a wall-clock adjustment must not lengthen or
        # cancel the cooldown.
        now = time.monotonic()
        if now < self._cooldown_until:
            return False  # silent during cooldown

        rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
        is_speech = rms > self.cfg.speech_threshold

        window_start = self._sample_cursor
        window_end = window_start + window.size

        if self._state == self.STATE_SILENCE:
            if is_speech:
                self._state = self.STATE_SPEAKING
                self._speech_start = window_start
                self._silence_run = 0
            return False

        # STATE_SPEAKING
        if is_speech:
            # Stay in SPEAKING even when the burst is already too long.
            # Dropping back to SILENCE mid-utterance would restart the
            # measurement and let the tail of a sentence pass for a word;
            # the duration check below rejects over-long bursts instead.
            self._silence_run = 0
            return False

        # Silent window inside SPEAKING — accumulate.
        self._silence_run += window.size
        if self._silence_run < self._post_silence:
            return False

        # The burst ended where the trailing silence began. _silence_run
        # already includes this window, so measure from the window's end.
        speech_len = (window_end - self._silence_run) - self._speech_start
        self._state = self.STATE_SILENCE
        self._silence_run = 0
        if self._min_speech <= speech_len <= self._max_speech:
            self._cooldown_until = now + self.cfg.cooldown_s
            return True
        return False


# ── standalone test ─────────────────────────────────────────────

if __name__ == "__main__":
    # Manual smoke test: stream the built-in microphone through a default
    # WakeDetector and print a timestamped line for every wake event.
    import os
    import sys

    # Put the parent of this file's directory on sys.path so the
    # ``Voice`` package import below resolves when run as a script.
    _HERE = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, os.path.dirname(_HERE))
    from Voice.builtin_mic import BuiltinMic

    print("WakeDetector standalone test — say 'Sanad' a few times.")
    print("(Ctrl-C to quit)\n")

    detector = WakeDetector()
    microphone = BuiltinMic()
    microphone.start()
    try:
        while True:
            if not detector.process(microphone.read_chunk(1024)):
                continue
            stamp = time.strftime('%H:%M:%S')
            print(f"  [WAKE]  (t={stamp})")
    except KeyboardInterrupt:
        # Ctrl-C is the documented way to leave the loop.
        pass
    finally:
        # Always release the microphone, however the loop ended.
        microphone.stop()