Marcus/Voice/wake_detector.py

187 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper).
Energy-envelope state machine. Monitors raw PCM audio and fires a wake
event when it sees a short speech burst (sized to match a single spoken
word like "Sanad") followed by a clear silence.
Why this exists:
Vosk's small English lexicon doesn't contain the word "sanad" and
substitutes arbitrary English words ("us", "of", "senate"). Whisper on
this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken
for this specific hardware + wake word. An acoustic detector using
only numpy doesn't care what the word actually is — it detects the
*shape* of a single spoken word in the audio energy envelope.
Algorithm (state machine):
    SILENCE  ──(rms > speech_threshold)──> SPEAKING
    SPEAKING ──(rms <= speech_threshold for post_silence_s)──> ANALYZE
    ANALYZE: if 0.2 s <= speech_duration <= 1.5 s → fire WAKE
             else → reset to SILENCE (too short = cough, too long = sentence)
    after fire → COOLDOWN for 1.5 s before next detection
What it does NOT do:
- Does not identify which word was spoken (anything in the
duration range triggers)
- Does not transcribe follow-on commands (you type those at the
terminal)
- Does not protect against loud non-speech (clapping, door slam)
Usage:
    from Voice.wake_detector import WakeConfig, WakeDetector
    det = WakeDetector(WakeConfig(sample_rate=16000))
    while True:
        chunk = mic.read_chunk(1024)  # bytes of int16 PCM
        if det.process(chunk):
            print("Wake!")
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import Optional
import numpy as np
@dataclass
class WakeConfig:
    """Tunable parameters for the energy-envelope wake detector.

    All durations are in seconds except ``chunk_ms``; thresholds are RMS
    values expressed in raw int16 sample units.
    """

    # Sampling rate of the incoming PCM stream, in Hz.
    sample_rate: int = 16000

    # A chunk whose RMS exceeds this value is classified as speech.
    # On the G1 on-board mic at normal speaking distance, speech sits at
    # roughly 500-1500 and silence at roughly 40-100, so 150 separates
    # the two with margin on both sides.
    speech_threshold: float = 150.0

    # Accepted length range for a burst to be treated as one "word".
    min_word_duration_s: float = 0.2
    max_word_duration_s: float = 1.5

    # Continuous silence required before the word is considered finished.
    post_silence_s: float = 0.3

    # Minimum spacing between two wake events, so that one spoken word
    # cannot trigger twice.
    cooldown_s: float = 1.5

    # Length of each RMS analysis window, in milliseconds.
    chunk_ms: int = 50
class WakeDetector:
    """Streaming acoustic wake detector — no language model required.

    Audio is analyzed in fixed windows of ``cfg.chunk_ms``. Each window is
    classified as speech or silence by comparing its RMS against
    ``cfg.speech_threshold``. A wake event fires when a burst of speech
    whose length lies in ``[min_word_duration_s, max_word_duration_s]`` is
    followed by ``post_silence_s`` of continuous silence.

    States:
        SILENCE   — idle, waiting for the first speech window.
        SPEAKING  — inside a burst that may still turn out to be a word.
        REJECTING — the burst already exceeded the maximum word length;
                    wait for it to end before re-arming, so the tail of a
                    long sentence is not mistaken for a new word.
    """

    STATE_SILENCE = "SILENCE"
    STATE_SPEAKING = "SPEAKING"
    STATE_REJECTING = "REJECTING"

    def __init__(self, cfg: Optional[WakeConfig] = None):
        """Create a detector.

        Args:
            cfg: Detector parameters; a default ``WakeConfig`` is used
                when omitted.
        """
        self.cfg = cfg or WakeConfig()
        rate = self.cfg.sample_rate
        # Never allow a zero-length window: process() advances by this
        # amount and would otherwise spin forever.
        self._chunk_samples = max(1, int(rate * self.cfg.chunk_ms / 1000))
        self._min_speech = int(self.cfg.min_word_duration_s * rate)
        self._max_speech = int(self.cfg.max_word_duration_s * rate)
        self._post_silence = int(self.cfg.post_silence_s * rate)

        self._state = self.STATE_SILENCE
        self._speech_start = 0   # sample index where current burst began
        self._silence_run = 0    # consecutive silent samples inside a burst
        self._sample_cursor = 0  # sample index of the window being analyzed
        # Monotonic-clock time after which we may fire again. -inf means
        # "no cooldown active" regardless of the clock's reference point.
        self._cooldown_until = float("-inf")

        # Leftover samples from the previous call (when the caller's
        # chunks don't align with our internal analysis window).
        self._carry = np.zeros(0, dtype=np.int16)

    # ── public API ────────────────────────────────────────────────

    def process(self, pcm_bytes: bytes) -> bool:
        """Feed int16 PCM bytes and report whether a wake event fired.

        Args:
            pcm_bytes: Raw int16 PCM samples. Any chunk size is accepted;
                samples that do not fill a whole analysis window are kept
                for the next call.

        Returns:
            True once per detected "word" (short speech burst followed by
            silence), otherwise False.

        Raises:
            ValueError: if ``len(pcm_bytes)`` is not a multiple of 2
                (raised by ``np.frombuffer``).
        """
        if not pcm_bytes:
            return False

        incoming = np.frombuffer(pcm_bytes, dtype=np.int16)
        if self._carry.size:
            samples = np.concatenate([self._carry, incoming])
        else:
            samples = incoming

        n = self._chunk_samples
        fired = False
        pos = 0
        while pos + n <= samples.size:
            fired = self._step(samples[pos:pos + n])
            # Advance the sample clock for every analyzed window,
            # including the one that fired, so it never drifts.
            pos += n
            self._sample_cursor += n
            if fired:
                # Stop here; the remainder is analyzed on the next call
                # so the caller sees exactly one True per word.
                break

        # Keep whatever was not analyzed for the next call.
        self._carry = samples[pos:].copy()
        return fired

    def reset(self) -> None:
        """Return to SILENCE and discard buffered samples.

        Call when resuming from a long pause. The sample clock and any
        active cooldown are left untouched.
        """
        self._state = self.STATE_SILENCE
        self._silence_run = 0
        self._carry = np.zeros(0, dtype=np.int16)

    # ── internal ──────────────────────────────────────────────────

    def _step(self, window: np.ndarray) -> bool:
        """Advance the state machine by one analysis window.

        ``self._sample_cursor`` must hold the index of the first sample
        of ``window``; the caller advances it after this returns.

        Returns:
            True if this window completed a valid word and a wake fired.
        """
        rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
        is_speech = rms > self.cfg.speech_threshold

        # Monotonic clock: immune to system clock adjustments. Note the
        # cooldown is wall-time based, not sample based.
        now = time.monotonic()
        if now < self._cooldown_until:
            return False  # detector is deaf during cooldown

        window_end = self._sample_cursor + window.size

        if self._state == self.STATE_SILENCE:
            if is_speech:
                self._state = self.STATE_SPEAKING
                self._speech_start = self._sample_cursor
                self._silence_run = 0
            return False

        # Inside a burst (SPEAKING or REJECTING).
        if is_speech:
            self._silence_run = 0
            # Burst is longer than a single word — the user is just
            # talking. Keep tracking until it ends, but never fire on it.
            if (self._state == self.STATE_SPEAKING
                    and window_end - self._speech_start > self._max_speech):
                self._state = self.STATE_REJECTING
            return False

        # Silent window inside a burst — accumulate.
        self._silence_run += window.size
        if self._silence_run < self._post_silence:
            return False

        # Burst has ended. The speech ran from _speech_start up to the
        # start of the trailing silence run (which includes this window).
        was_candidate = self._state == self.STATE_SPEAKING
        speech_len = (window_end - self._silence_run) - self._speech_start
        self._state = self.STATE_SILENCE
        self._silence_run = 0

        if was_candidate and self._min_speech <= speech_len <= self._max_speech:
            self._cooldown_until = now + self.cfg.cooldown_s
            return True
        return False
# ── standalone test ─────────────────────────────────────────────
if __name__ == "__main__":
    import os
    import sys

    # Put the parent of this file's directory on sys.path so that the
    # `Voice` package import below resolves when run as a script.
    _voice_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, os.path.dirname(_voice_dir))
    from Voice.builtin_mic import BuiltinMic

    print("WakeDetector standalone test — say 'Sanad' a few times.")
    print("(Ctrl-C to quit)\n")

    det = WakeDetector()
    mic = BuiltinMic()
    mic.start()
    try:
        # Poll the microphone forever, reporting each wake event with a
        # wall-clock timestamp.
        while True:
            if det.process(mic.read_chunk(1024)):
                stamp = time.strftime('%H:%M:%S')
                print(f" [WAKE] (t={stamp})")
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to end the test.
        pass
    finally:
        mic.stop()