#!/usr/bin/env python3 """ Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper). Energy-envelope state machine. Monitors raw PCM audio and fires a wake event when it sees a short speech burst (sized to match a single spoken word like "Sanad") followed by a clear silence. Why this exists: Vosk's small English lexicon doesn't contain the word "sanad" and substitutes arbitrary English words ("us", "of", "senate"). Whisper on this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken for this specific hardware + wake word. An acoustic detector using only numpy doesn't care what the word actually is — it detects the *shape* of a single spoken word in the audio energy envelope. Algorithm (state machine): SILENCE ──(rms > speech_threshold)──> SPEAKING SPEAKING ──(rms < silence_threshold for N chunks)──> ANALYZE ANALYZE: if 0.2 s < speech_duration < 1.5 s → fire WAKE else → reset to SILENCE (too short = cough, too long = sentence) after fire → COOLDOWN for 1.5 s before next detection What it does NOT do: - Does not identify which word was spoken (anything in the duration range triggers) - Does not transcribe follow-on commands (you type those at the terminal) - Does not protect against loud non-speech (clapping, door slam) Usage: from Voice.wake_detector import WakeDetector det = WakeDetector(sample_rate=16000) while True: chunk = mic.read_chunk(1024) # bytes of int16 PCM if det.process(chunk): print("Wake!") """ from __future__ import annotations import time from dataclasses import dataclass from typing import Optional import numpy as np @dataclass class WakeConfig: sample_rate: int = 16_000 # RMS (int16 units) FLOOR for "this chunk is speech". The effective # threshold is max(speech_threshold, ambient_baseline * adaptive_mult) # so this is only a minimum guarantee — the detector adapts upward # in noisy rooms but never below this floor. # G1 far-field mic at normal speaking distance has rms ~ 80-400 for # quiet speech, 400-1500 for clear speech. 80 catches quiet speech; # raise to 120-150 if fan/typing noise triggers false wakes. speech_threshold: float = 80.0 # How long a burst of speech must last to count as a "word". min_word_duration_s: float = 0.20 max_word_duration_s: float = 1.50 # How long of continuous silence we need to consider the word ended. post_silence_s: float = 0.30 # Minimum gap between two consecutive wake fires. Prevents a single # spoken word from triggering twice. cooldown_s: float = 1.50 # RMS window size — we analyze this many ms of audio per step. chunk_ms: int = 50 # Adaptive: how many *recent silent* chunks to average for the noise # floor, and the multiplier applied on top. effective_threshold = # max(speech_threshold, baseline * adaptive_mult). adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks adaptive_mult: float = 3.0 # Periodic diagnostic log cadence (seconds). 0 disables. diag_log_sec: float = 3.0 class WakeDetector: """Streaming acoustic wake detector — no language model required.""" STATE_SILENCE = "SILENCE" STATE_SPEAKING = "SPEAKING" def __init__(self, cfg: Optional[WakeConfig] = None): self.cfg = cfg or WakeConfig() self._chunk_samples = int(self.cfg.sample_rate * self.cfg.chunk_ms / 1000) self._min_speech = int(self.cfg.min_word_duration_s * self.cfg.sample_rate) self._max_speech = int(self.cfg.max_word_duration_s * self.cfg.sample_rate) self._post_silence = int(self.cfg.post_silence_s * self.cfg.sample_rate) self._state = self.STATE_SILENCE self._speech_start = 0 # sample index where current burst began self._silence_run = 0 # consecutive silent samples inside SPEAKING self._sample_cursor = 0 # running sample count since start self._cooldown_until = 0.0 # wall-clock time after which we can fire again # A small rolling buffer of leftover samples (when the caller's # chunks don't align with our internal analysis window). self._carry = np.zeros(0, dtype=np.int16) # Audio of the most-recent wake-triggering burst. Saved when the # detector fires so callers (marcus_voice) can run Whisper on it # and verify the word was actually "Sanad" rather than a cough. self._burst_samples: list = [] # accumulated during SPEAKING self._last_burst_audio: Optional[np.ndarray] = None # Adaptive noise floor (rolling mean of RMS during SILENCE chunks). self._baseline_buf = [] # last N silent-window RMS values self._baseline = 0.0 # current estimate self._peak_since_diag = 0.0 # max rms since last diag log self._last_diag = time.time() # Logger is optional — if the host app set up logging, use it. try: import logging self._log = logging.getLogger("wake_detector") except Exception: self._log = None # ── public API ──────────────────────────────────────────────── def process(self, pcm_bytes: bytes) -> bool: """ Feed int16 PCM bytes. Returns True once per spoken "word" (short speech burst followed by silence). """ if not pcm_bytes: return False incoming = np.frombuffer(pcm_bytes, dtype=np.int16) samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming fired = False n = self._chunk_samples i = 0 while i + n <= samples.size: window = samples[i:i + n] if self._step(window): fired = True # break — flush the rest on next call so we get one fire per word i += n break i += n self._sample_cursor += n # Keep whatever didn't fit in a full window for next call. self._carry = samples[i:].copy() return fired def reset(self) -> None: """Drop all state — call when resuming from a long pause.""" self._state = self.STATE_SILENCE self._silence_run = 0 self._carry = np.zeros(0, dtype=np.int16) self._burst_samples = [] def get_last_burst(self) -> Optional[np.ndarray]: """ Return the int16 PCM samples of the most-recent wake-triggering burst, or None if no wake has fired yet. Used by marcus_voice to verify the triggering word was actually 'Sanad' before proceeding. """ return self._last_burst_audio # ── internal ────────────────────────────────────────────────── def _step(self, window: np.ndarray) -> bool: rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2))) # Effective threshold = max(config floor, adaptive baseline * mult) eff = self.cfg.speech_threshold if self._baseline > 0: eff = max(eff, self._baseline * self.cfg.adaptive_mult) is_speech = rms > eff # Track peak for diag. Log periodically so you can *see* what the # detector is hearing — invaluable when "not hearing me" happens. if rms > self._peak_since_diag: self._peak_since_diag = rms now = time.time() if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec: if self._log is not None: self._log.info( "wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s", self._peak_since_diag, self._baseline, eff, self._state, ) self._peak_since_diag = 0.0 self._last_diag = now if now < self._cooldown_until: return False # silent during cooldown if self._state == self.STATE_SILENCE: # Learn the noise floor ONLY in silence — so speech bursts # don't pull the baseline up and lock us out of wake. if not is_speech: self._baseline_buf.append(rms) if len(self._baseline_buf) > self.cfg.adaptive_window_n: self._baseline_buf.pop(0) if self._baseline_buf: self._baseline = sum(self._baseline_buf) / len(self._baseline_buf) if is_speech: self._state = self.STATE_SPEAKING self._speech_start = self._sample_cursor self._silence_run = 0 # Begin capturing the burst audio for later Whisper verify. self._burst_samples = [window.copy()] return False # STATE_SPEAKING # Accumulate every window (speech OR silence inside the burst) # so we capture the full word + trailing quiet. self._burst_samples.append(window.copy()) if is_speech: self._silence_run = 0 # Abort if the burst is longer than a single word — user is # just talking, not addressing the robot. if self._sample_cursor - self._speech_start > self._max_speech: self._state = self.STATE_SILENCE self._burst_samples = [] return False # Silent window inside SPEAKING — accumulate. self._silence_run += window.size if self._silence_run >= self._post_silence: speech_len = (self._sample_cursor - self._silence_run) - self._speech_start self._state = self.STATE_SILENCE self._silence_run = 0 if self._min_speech <= speech_len <= self._max_speech: # Snapshot burst audio for the caller's Whisper verify. self._last_burst_audio = ( np.concatenate(self._burst_samples) if self._burst_samples else None ) self._burst_samples = [] self._cooldown_until = now + self.cfg.cooldown_s return True return False # ── standalone test ───────────────────────────────────────────── if __name__ == "__main__": import os import sys _HERE = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.dirname(_HERE)) from Voice.builtin_mic import BuiltinMic print("WakeDetector standalone test — say 'Sanad' a few times.") print("(Ctrl-C to quit)\n") det = WakeDetector() mic = BuiltinMic() mic.start() try: while True: chunk = mic.read_chunk(1024) if det.process(chunk): print(f" [WAKE] (t={time.strftime('%H:%M:%S')})") except KeyboardInterrupt: pass finally: mic.stop()