Update 2026-04-23 09:54:45

kassam 2026-04-23 09:54:45 +04:00
parent ce09b6920a
commit f45e12fae5
3 changed files with 296 additions and 362 deletions

config_Voice.json

@@ -5,21 +5,14 @@
     "target_sample_rate": 16000
   },
   "stt": {
-    "backend": "vosk",
-    "vosk_model_path": "Models/vosk-model-small-en-us-0.15",
-    "wake_words_en": [
-      "sanad", "sannad", "sanat", "sunnat",
-      "senad", "sennad", "sanid", "sanud",
-      "samad", "sandy", "sanday", "sunday", "synod", "signed",
-      "sand", "send", "sent", "set", "seen", "seed",
-      "then", "than", "that", "step", "stuck",
-      "said", "sad", "saw", "so", "sir", "sun"
-    ],
-    "language": "en",
-    "command_timeout_sec": 10,
-    "silence_threshold": 150,
-    "silence_duration_sec": 2.0,
-    "max_record_sec": 15
+    "backend": "custom_acoustic",
+    "_comment": "Pure-DSP wake detector in Voice/wake_detector.py. No ML.",
+    "speech_threshold": 150.0,
+    "min_word_duration": 0.20,
+    "max_word_duration": 1.50,
+    "post_silence": 0.30,
+    "wake_cooldown": 1.50,
+    "wake_chunk_ms": 50
   },
   "mic": {
     "backend": "builtin_udp",

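The new stt keys feed Voice/wake_detector.py directly. For intuition about
speech_threshold (an RMS value in int16 units, measured over one wake_chunk_ms
window), here is a minimal sketch of the per-window decision; the noise
amplitude is an illustrative assumption matching the rms ranges noted in
WakeConfig below:

    import numpy as np

    def window_is_speech(pcm_int16: np.ndarray, speech_threshold: float = 150.0) -> bool:
        """One wake_chunk_ms window (800 samples = 50 ms at 16 kHz): speech or not?"""
        rms = float(np.sqrt(np.mean(pcm_int16.astype(np.float64) ** 2)))
        return rms > speech_threshold

    # Idle-mic noise (rms ~60) stays below the 150.0 threshold.
    noise = np.random.default_rng(1).normal(0, 60, 800).astype(np.int16)
    print(window_is_speech(noise))   # False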
Voice/marcus_voice.py

@@ -1,38 +1,41 @@
 #!/usr/bin/env python3
 """
-Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
-=======================================================================
-State machine:
-    IDLE → (wake word detected) → WAKE_HEARD
-    WAKE_HEARD → (record command) → PROCESSING
-    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
-    SPEAKING → (TTS done) → IDLE
-Wake word: "Sanad" (detected by Whisper tiny; mistranscription variants in
-    config_Voice.json::stt.wake_words_en)
-Commands: Transcribed by Whisper tiny (small if quality suffers)
-Mic:      G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
-TTS:      English only, Unitree built-in TtsMaker (API/audio_api.py)
-Usage:
-    from Voice.marcus_voice import VoiceModule
-    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
-    voice.start()   # background thread
-    voice.stop()
+Voice/marcus_voice.py — Marcus Wake-Signal Module (no ML, no STT).
+
+This is a deliberately-minimal voice subsystem:
+
+- A custom energy-based wake detector (Voice/wake_detector.py) listens
+  to the G1's on-board mic continuously.
+- When the user says any short word (~0.2-1.5 s of speech followed by
+  silence), wake fires.
+- The robot acknowledges via TTS ("Yes", configurable).
+- The user then types their command at the Marcus terminal prompt.
+
+No Vosk, no Whisper, no torch, no network. Pure numpy DSP.
+
+Why not STT here:
+    Both Vosk's small English model ("sanad" absent from lexicon) and
+    openai-whisper ("!!!!!" numerical garbage on this Jetson's torch-aarch64)
+    proved unreliable for this hardware. Rather than fight either, the
+    wake path becomes a simple "did the user say something?" signal.
+
+Interface with Marcus brain:
+    VoiceModule(audio_api, on_wake=callback)
+    on_wake() is called when wake fires. Brain can display a prompt
+    or do anything else.
 """
+from __future__ import annotations
+
 import logging
 import os
 import sys
 import threading
 import time
 from logging.handlers import RotatingFileHandler
-from typing import Optional
-
-import numpy as np
+from typing import Callable, Optional

 # ─── PATH + CONFIG ───────────────────────────────────────
+# Single source of truth lives in Core/; everyone else imports from there.
 _PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if _PROJECT_DIR not in sys.path:
     sys.path.insert(0, _PROJECT_DIR)
@@ -42,11 +45,6 @@ from Core.config_loader import load_config
 LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
 os.makedirs(LOG_DIR, exist_ok=True)

-# Voice runs as a background subsystem — its INFO/DEBUG logs go ONLY to
-# logs/voice.log so they don't drown out the interactive `Command:` prompt.
-# Anything the user needs to see (wake-word fired, command heard) is
-# print()-ed explicitly from the callbacks below.
-# basicConfig is idempotent; audio_api may have already called it.
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
@@ -60,352 +58,121 @@ logging.basicConfig(
 log = logging.getLogger("marcus_voice")

-# ─── STATE ENUM ──────────────────────────────────────────
-class State:
-    IDLE = "IDLE"
-    WAKE_HEARD = "WAKE_HEARD"
-    PROCESSING = "PROCESSING"
-    SPEAKING = "SPEAKING"
-
-# ─── VOICE MODULE ────────────────────────────────────────
 class VoiceModule:
-    """Always-listening voice interface for Marcus."""
+    """Wake-only voice subsystem — fires a callback when speech is detected."""

-    def __init__(self, audio_api, on_command=None):
+    def __init__(self, audio_api, on_command: Optional[Callable] = None,
+                 on_wake: Optional[Callable] = None):
         """
         Args:
-            audio_api:  AudioAPI instance (from API/audio_api.py)
-            on_command: callback(text: str, lang: str) — "lang" is always "en"
-                        now; kept in the signature for interface stability.
+            audio_api:  AudioAPI instance (for TTS ack).
+            on_command: kept for API compatibility; always called with
+                        text="" because there's no STT. Brain should
+                        prompt the user to type.
+            on_wake:    alternative callback fired when wake detected.
+                        Exactly one of on_command / on_wake is used.
         """
         self._audio = audio_api
         self._on_command = on_command
+        self._on_wake = on_wake
         self._config = load_config("Voice")
-        self._stt = self._config["stt"]
-        self._mic = self._config["mic"]
+        self._stt = self._config.get("stt", {})
+        self._messages = self._config.get("messages", {})

-        # STT (Vosk) — lazy loaded on first _voice_loop() iteration.
-        # One Model instance, recognizers are created fresh per-utterance.
-        self._vosk_model = None
-        self._KaldiRecognizer = None
-
-        # Wake words (English only — built-in TTS doesn't do Arabic)
-        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
-                                                          ["marcus", "marcos"])]
+        # Wake-detector parameters (tweakable via config_Voice.json::stt).
+        from Voice.wake_detector import WakeDetector, WakeConfig
+        wcfg = WakeConfig(
+            sample_rate         = 16_000,
+            speech_threshold    = float(self._stt.get("speech_threshold", 150.0)),
+            min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)),
+            max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)),
+            post_silence_s      = float(self._stt.get("post_silence", 0.30)),
+            cooldown_s          = float(self._stt.get("wake_cooldown", 1.50)),
+            chunk_ms            = int(self._stt.get("wake_chunk_ms", 50)),
+        )
+        self._detector = WakeDetector(wcfg)

         # G1 built-in mic (UDP multicast).
         from Voice.builtin_mic import BuiltinMic
         _mcfg = self._config.get("mic_udp", {})
         self._mic_capture = BuiltinMic(
-            group=_mcfg.get("group", "239.168.123.161"),
-            port=_mcfg.get("port", 5555),
-            buf_max=_mcfg.get("buffer_max_bytes", 64000),
+            group   = _mcfg.get("group", "239.168.123.161"),
+            port    = _mcfg.get("port", 5555),
+            buf_max = _mcfg.get("buffer_max_bytes", 64000),
         )
-        self._sample_rate = self._mic_capture.sample_rate  # 16000
-
-        # State
-        self._state = State.IDLE
         self._running = False
         self._thread = None
-        self._lock = threading.Lock()

-        log.info("VoiceModule initialized (mic: G1 built-in UDP)")
+        log.info(
+            "VoiceModule initialized (custom wake detector, "
+            "speech_threshold=%s, min/max_word=%s/%s s)",
+            wcfg.speech_threshold, wcfg.min_word_duration_s, wcfg.max_word_duration_s,
+        )

-    # ─── MODEL LOADING ────────────────────────────────────
-    def _load_stt(self):
-        """
-        Load Vosk ASR model. Replaces openai-whisper which produced garbage
-        ("!!!!!!!") on this Jetson's torch-aarch64 install regardless of
-        audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no
-        numerical instability, ~10× faster than Whisper base on CPU.
-
-        Model path is configured via stt.vosk_model_path (relative to
-        PROJECT_ROOT, or absolute). Default: the small English model,
-        which is ~40 MB and plenty for short voice commands.
-        """
-        from vosk import Model, KaldiRecognizer, SetLogLevel
-        SetLogLevel(-1)  # silence Vosk's stderr spam
-
-        if self._vosk_model is None:
-            rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
-            model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
-            if not os.path.isdir(model_path):
-                raise RuntimeError(
-                    "[Voice] Vosk model not found at " + model_path + "\n"
-                    "  Download it on the Jetson:\n"
-                    "    cd ~/Marcus/Models\n"
-                    "    wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
-                    "    unzip vosk-model-small-en-us-0.15.zip"
-                )
-            log.info("Loading Vosk model: %s", model_path)
-            self._vosk_model = Model(model_path)
-            self._KaldiRecognizer = KaldiRecognizer
-            log.info("Vosk model ready")
-
-        # NO restricted grammar. Vosk's small English model's lexicon
-        # doesn't contain "sanad" (it's not an English word), so passing
-        # it in a restricted grammar makes Vosk drop the word with:
-        #     WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
-        #     vocabulary: 'sanad'
-        # and the decoder then only has "[unk]" → never matches
-        # anything → Transcribed always empty.
-        #
-        # Instead: open vocabulary transcription, fuzzy-match against
-        # the stt.wake_words_en list which contains the English words
-        # Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
-        # step, signed, etc.).
-        self._wake_grammar = None
-
-    # Back-compat alias for any caller that still references the old name
-    _load_whisper = _load_stt
-
-    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
-    def _record_chunk(self, seconds: float) -> np.ndarray:
-        """Capture a fixed-duration chunk from the G1 built-in mic."""
-        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
-        raw = bytearray()
-        bite = 1024
-        while len(raw) < num_bytes:
-            raw.extend(self._mic_capture.read_chunk(min(bite, num_bytes - len(raw))))
-        return np.frombuffer(bytes(raw), dtype=np.int16)
-
-    def _record_until_silence(self) -> np.ndarray:
-        """Capture until RMS drops below threshold for `silence_duration_sec`."""
-        threshold = self._stt.get("silence_threshold", 500)
-        silence_dur = self._stt.get("silence_duration_sec", 1.5)
-        max_dur = self._stt.get("max_record_sec", 15)
-        chunk_sec = 0.5
-        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
-        silence_chunks_need = int(silence_dur / chunk_sec)
-        max_chunks = int(max_dur / chunk_sec)
-
-        all_audio = []
-        silence_count = 0
-        chunk_count = 0
-        while chunk_count < max_chunks:
-            raw = self._mic_capture.read_chunk(chunk_bytes)
-            if not raw:
-                break
-            chunk = np.frombuffer(raw, dtype=np.int16)
-            all_audio.append(chunk)
-            chunk_count += 1
-            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
-            if rms < threshold:
-                silence_count += 1
-            else:
-                silence_count = 0
-            if silence_count >= silence_chunks_need and chunk_count > 2:
-                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
-                break
-        if all_audio:
-            return np.concatenate(all_audio)
-        return np.array([], dtype=np.int16)
-
-    # ─── TRANSCRIPTION ────────────────────────────────────
-    def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
-        """
-        Transcribe audio using Vosk.
-
-        When `grammar` is a JSON list string (e.g. '["sanad","[unk]"]'),
-        Vosk is constrained to that vocabulary only — perfect for wake-word
-        detection where we KNOW the exact word we want to hear. Pass
-        grammar=None for open-vocabulary transcription (used for commands).
-        """
-        import json as _json
-
-        # Audio stats — still useful for "mic is silent" diagnostics.
-        peak_i16 = int(np.abs(audio).max()) if audio.size else 0
-        rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
-        log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
-        if audio.size == 0:
-            return ""
-
-        # Fresh recognizer per utterance. Pass grammar if provided.
-        if grammar:
-            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
-        else:
-            rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
-        rec.SetWords(False)
-
-        # Single-shot: feed the whole utterance in one AcceptWaveform call,
-        # then take FinalResult. Chunk-based feeding split short "sanad"
-        # utterances across chunk boundaries and Vosk's decoder often
-        # refused to commit, returning empty. Single-shot works for every
-        # voice-assistant example in Vosk's docs.
-        #
-        # When FinalResult is empty, also check PartialResult — sometimes
-        # Vosk heard something but didn't reach a segmentation boundary
-        # yet. PartialResult still has the text, just not "finalized".
-        rec.AcceptWaveform(audio.tobytes())
-        final = _json.loads(rec.FinalResult()).get("text", "").strip()
-        if not final:
-            partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
-            if partial:
-                final = partial
-                log.info("  (partial only, no final commit)")
-        text = final
-        if not text:
-            log.info("Transcribed: (empty)")
-            return ""
-        log.info("Transcribed: %s", text[:100])
-        return text
-
-    def _check_wake_word(self, text: str) -> bool:
-        """
-        Check if transcribed text contains an English wake word.
-        Matches on word boundary (so "sandstorm" doesn't trigger off "sand"),
-        but is lenient about punctuation/whitespace around the word.
-        """
-        import re
-        text_lower = text.lower().strip()
-        # word-boundary regex built once per call (cheap; runs 2×/sec)
-        for w in self._wake_en:
-            if re.search(r'\b' + re.escape(w) + r'\b', text_lower):
-                return True
-        return False
-
-    # ─── MAIN LOOP ────────────────────────────────────────
+    # ─── main loop ────────────────────────────────────────
     def _voice_loop(self):
-        """Main voice processing loop — runs in background thread."""
-        self._load_whisper()
         self._mic_capture.start()
-        log.info("Voice loop started — listening for wake word...")
+        log.info("Voice loop started — listening for wake (energy-based, no ML)")
         while self._running:
             try:
-                if self._state == State.IDLE:
-                    self._do_idle()
-                elif self._state == State.WAKE_HEARD:
-                    self._do_wake_heard()
-                elif self._state == State.PROCESSING:
-                    self._do_processing()
-                elif self._state == State.SPEAKING:
-                    # Wait for any TTS to finish before returning to IDLE
-                    while self._audio.is_speaking:
-                        time.sleep(0.1)
-                    self._state = State.IDLE
+                # Don't listen while the robot is speaking (prevents
+                # self-trigger from TTS output leaking into the mic).
+                if self._audio.is_speaking:
+                    time.sleep(0.1)
+                    self._detector.reset()
+                    continue
+
+                chunk = self._mic_capture.read_chunk(1024)  # ~32 ms at 16 kHz
+                if not chunk:
+                    continue
+
+                if self._detector.process(chunk):
+                    self._on_wake_fired()
             except Exception as e:
                 log.error("Voice loop error: %s", e, exc_info=True)
-                self._state = State.IDLE
                 time.sleep(1)

-    def _do_idle(self):
-        """Listen for wake word in 4-second chunks. Longer windows give
-        Vosk's decoder enough context to commit short utterances like a
-        single 'sanad'."""
-        # Skip if robot is speaking — prevents self-listening
-        if self._audio.is_speaking:
-            time.sleep(0.2)
-            return
-
-        audio = self._record_chunk(4.0)
-
-        # Double-check speaking didn't start during recording
-        if self._audio.is_speaking:
-            return
-
-        # Skip if too quiet (no one talking). Threshold lowered to 60 to
-        # match the G1 on-board mic's typical noise floor (std ~30-80 when
-        # idle, ~150+ when someone speaks). With 100 we were skipping
-        # quiet "sanad" utterances entirely.
-        if audio.std() < 60:
-            return
-
-        # Wake-word pass uses restricted Vosk grammar (only "sanad" or "[unk]")
-        text = self._transcribe(audio, grammar=self._wake_grammar)
-        if self._check_wake_word(text):
-            log.info("Wake word detected!")
-            # One clean line to the terminal so the operator knows voice
-            # actually heard them, even though all other voice logs are
-            # file-only. \n leads because we may be painting over a
-            # half-drawn `Command:` prompt.
-            print("\n  [Sanad] wake heard — recording command…")
-            self._state = State.WAKE_HEARD
-            # Acknowledge
-            self._audio.speak(self._config["messages"]["wake_heard"])
-
-    def _do_wake_heard(self):
-        """Record the command until silence."""
-        # Wait for "Yes" TTS to finish before recording.
-        while self._audio.is_speaking:
-            time.sleep(0.1)
-
-        # CRITICAL: flush the mic ring buffer. The UDP multicast receiver
-        # has been accumulating audio continuously (including pre-wake
-        # silence and the TTS "Yes" that just played back into the mic
-        # path). Without flush, _record_until_silence() reads the old
-        # buffered silence instantly, counts 3 silent chunks, and exits
-        # before the user has started speaking the command.
-        self._mic_capture.flush()
-
-        log.info("Recording command...")
-        audio = self._record_until_silence()
-        if len(audio) < 4000:  # < 0.25s at 16kHz
-            log.info("Too short, ignoring")
-            self._audio.speak(self._config["messages"]["no_speech"])
-            self._state = State.IDLE
-            return
-        self._command_audio = audio
-        self._state = State.PROCESSING
-
-    def _do_processing(self):
-        """Transcribe the command and send to brain."""
-        text = self._transcribe(self._command_audio)
-        self._command_audio = None
-        if not text or len(text.strip()) < 2:
-            log.info("Empty transcription")
-            self._audio.speak(self._config["messages"]["no_speech"])
-            self._state = State.IDLE
-            return
-
-        log.info("Command: %s", text)
-        # Send to brain callback (lang always "en" in this build)
-        if self._on_command:
-            try:
-                self._on_command(text, "en")
-            except Exception as e:
-                log.error("Brain callback error: %s", e)
-        self._state = State.IDLE
+    def _on_wake_fired(self):
+        log.info("Wake detected (acoustic)")
+        print("\n  [Sanad] wake heard — type your command at the prompt.")
+        # TTS ack
+        msg = self._messages.get("wake_heard", "Yes")
+        try:
+            self._audio.speak(msg)
+        except Exception as e:
+            log.warning("TTS ack failed: %s", e)
+
+        # Brain callbacks for compatibility with the old interface.
+        if self._on_wake:
+            try:
+                self._on_wake()
+            except Exception as e:
+                log.error("on_wake callback error: %s", e)
+        elif self._on_command:
+            # Old API expected (text, lang). We have no transcription, so
+            # pass empty text — brain is expected to prompt for typed input.
+            try:
+                self._on_command("", "en")
+            except Exception as e:
+                log.error("on_command callback error: %s", e)

-    # ─── START / STOP ─────────────────────────────────────
+    # ─── start / stop ─────────────────────────────────────
     def start(self):
-        """Start voice listening in background thread."""
         if self._running:
-            log.warning("Voice module already running")
+            log.warning("VoiceModule already running")
             return
         self._running = True
-        self._state = State.IDLE
-        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
+        self._thread = threading.Thread(
+            target=self._voice_loop, daemon=True, name="voice",
+        )
         self._thread.start()
         log.info("Voice module started")

     def stop(self):
-        """Stop voice listening."""
         self._running = False
         try:
             self._mic_capture.stop()
@@ -416,35 +183,23 @@ class VoiceModule:
         self._thread = None
         log.info("Voice module stopped")

-    @property
-    def state(self) -> str:
-        return self._state
-
     @property
     def is_running(self) -> bool:
         return self._running

-# ─── STANDALONE TEST ─────────────────────────────────────
+# ─── standalone test ─────────────────────────────────────
 if __name__ == "__main__":
-    import sys
-    sys.path.insert(0, PROJECT_ROOT)
     from API.audio_api import AudioAPI

-    def on_command(text, lang):
-        print(f"\n{'='*50}")
-        print(f"  COMMAND [{lang}]: {text}")
-        print(f"{'='*50}\n")
+    def on_wake():
+        print("  (brain callback fired)")

     audio = AudioAPI()
-    voice = VoiceModule(audio, on_command=on_command)
+    voice = VoiceModule(audio, on_wake=on_wake)

-    print("Starting voice module... say 'Marcus' to wake.")
-    print("Press Ctrl+C to stop.\n")
+    print("Starting voice module... say any short word to test the wake.")
+    print("Press Ctrl-C to stop.\n")
     voice.start()
     try:
         while voice.is_running:
             time.sleep(0.5)

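With STT gone, on_wake fires on the background voice thread, so a brain that
reads a typed command should not block inside the callback (that would stall
the mic read loop). A minimal wiring sketch; the threading.Event handoff and
the prompt loop are illustrative, not part of this commit:

    import threading

    from API.audio_api import AudioAPI
    from Voice.marcus_voice import VoiceModule

    wake = threading.Event()

    audio = AudioAPI()
    voice = VoiceModule(audio, on_wake=wake.set)   # callback only sets a flag
    voice.start()

    try:
        while True:
            wake.wait()                  # woken by the voice thread
            wake.clear()
            cmd = input("Command: ")     # typed command, per the new design
            print(f"(brain would dispatch: {cmd!r})")
    except KeyboardInterrupt:
        voice.stop()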
Voice/wake_detector.py (new file, 186 lines)
@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper).

Energy-envelope state machine. Monitors raw PCM audio and fires a wake
event when it sees a short speech burst (sized to match a single spoken
word like "Sanad") followed by a clear silence.

Why this exists:
    Vosk's small English lexicon doesn't contain the word "sanad" and
    substitutes arbitrary English words ("us", "of", "senate"). Whisper on
    this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken
    for this specific hardware + wake word. An acoustic detector using
    only numpy doesn't care what the word actually is — it detects the
    *shape* of a single spoken word in the audio energy envelope.

Algorithm (state machine):
    SILENCE  --(rms > speech_threshold)-->  SPEAKING
    SPEAKING --(rms < speech_threshold for post_silence_s)--> ANALYZE
    ANALYZE:  if 0.2 s < speech_duration < 1.5 s --> fire WAKE
              else reset to SILENCE (too short = cough, too long = sentence)
    after fire --> COOLDOWN for 1.5 s before next detection

What it does NOT do:
    - Does not identify which word was spoken (anything in the
      duration range triggers)
    - Does not transcribe follow-on commands (you type those at the
      terminal)
    - Does not protect against loud non-speech (clapping, door slam)

Usage:
    from Voice.wake_detector import WakeDetector, WakeConfig
    det = WakeDetector(WakeConfig(sample_rate=16000))
    while True:
        chunk = mic.read_chunk(1024)   # bytes of int16 PCM
        if det.process(chunk):
            print("Wake!")
"""
from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Optional

import numpy as np


@dataclass
class WakeConfig:
    sample_rate: int = 16_000

    # RMS (int16 units) above which we consider a chunk to be speech.
    # G1 on-board mic at normal speaking distance has rms ≈ 500-1500
    # during speech and ≈ 40-100 in silence. 150 is a safe middle ground.
    speech_threshold: float = 150.0

    # How long a burst of speech must last to count as a "word".
    min_word_duration_s: float = 0.20
    max_word_duration_s: float = 1.50

    # How long of continuous silence we need to consider the word ended.
    post_silence_s: float = 0.30

    # Minimum gap between two consecutive wake fires. Prevents a single
    # spoken word from triggering twice.
    cooldown_s: float = 1.50

    # RMS window size — we analyze this many ms of audio per step.
    chunk_ms: int = 50


class WakeDetector:
    """Streaming acoustic wake detector — no language model required."""

    STATE_SILENCE = "SILENCE"
    STATE_SPEAKING = "SPEAKING"

    def __init__(self, cfg: Optional[WakeConfig] = None):
        self.cfg = cfg or WakeConfig()
        self._chunk_samples = int(self.cfg.sample_rate * self.cfg.chunk_ms / 1000)
        self._min_speech = int(self.cfg.min_word_duration_s * self.cfg.sample_rate)
        self._max_speech = int(self.cfg.max_word_duration_s * self.cfg.sample_rate)
        self._post_silence = int(self.cfg.post_silence_s * self.cfg.sample_rate)

        self._state = self.STATE_SILENCE
        self._speech_start = 0      # sample index where current burst began
        self._silence_run = 0       # consecutive silent samples inside SPEAKING
        self._sample_cursor = 0     # running sample count since start
        self._cooldown_until = 0.0  # wall-clock time after which we can fire again

        # A small rolling buffer of leftover samples (when the caller's
        # chunks don't align with our internal analysis window).
        self._carry = np.zeros(0, dtype=np.int16)

    # ── public API ────────────────────────────────────────────────
    def process(self, pcm_bytes: bytes) -> bool:
        """
        Feed int16 PCM bytes. Returns True once per spoken "word"
        (short speech burst followed by silence).
        """
        if not pcm_bytes:
            return False
        incoming = np.frombuffer(pcm_bytes, dtype=np.int16)
        samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming

        fired = False
        n = self._chunk_samples
        i = 0
        while i + n <= samples.size:
            window = samples[i:i + n]
            if self._step(window):
                fired = True
                # break — flush the rest on next call so we get one fire per word
                i += n
                break
            i += n
            self._sample_cursor += n
        # Keep whatever didn't fit in a full window for next call.
        self._carry = samples[i:].copy()
        return fired

    def reset(self) -> None:
        """Drop all state — call when resuming from a long pause."""
        self._state = self.STATE_SILENCE
        self._silence_run = 0
        self._carry = np.zeros(0, dtype=np.int16)

    # ── internal ──────────────────────────────────────────────────
    def _step(self, window: np.ndarray) -> bool:
        rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
        is_speech = rms > self.cfg.speech_threshold
        now = time.time()

        if now < self._cooldown_until:
            return False  # silent during cooldown

        if self._state == self.STATE_SILENCE:
            if is_speech:
                self._state = self.STATE_SPEAKING
                self._speech_start = self._sample_cursor
                self._silence_run = 0
            return False

        # STATE_SPEAKING
        if is_speech:
            self._silence_run = 0
            # Abort if the burst is longer than a single word — user is
            # just talking, not addressing the robot.
            if self._sample_cursor - self._speech_start > self._max_speech:
                self._state = self.STATE_SILENCE
            return False

        # Silent window inside SPEAKING — accumulate.
        self._silence_run += window.size
        if self._silence_run >= self._post_silence:
            speech_len = (self._sample_cursor - self._silence_run) - self._speech_start
            self._state = self.STATE_SILENCE
            self._silence_run = 0
            if self._min_speech <= speech_len <= self._max_speech:
                self._cooldown_until = now + self.cfg.cooldown_s
                return True
        return False


# ── standalone test ─────────────────────────────────────────────
if __name__ == "__main__":
    import os
    import sys
    _HERE = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, os.path.dirname(_HERE))
    from Voice.builtin_mic import BuiltinMic

    print("WakeDetector standalone test — say 'Sanad' a few times.")
    print("(Ctrl-C to quit)\n")

    det = WakeDetector()
    mic = BuiltinMic()
    mic.start()
    try:
        while True:
            chunk = mic.read_chunk(1024)
            if det.process(chunk):
                print(f"  [WAKE]  (t={time.strftime('%H:%M:%S')})")
    except KeyboardInterrupt:
        pass
    finally:
        mic.stop()
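
The detector can also be exercised offline with synthetic PCM, no mic needed.
A rough sketch; the burst amplitude (rms ≈ 1000) is an assumption matching the
speech range documented in WakeConfig:

    import numpy as np
    from Voice.wake_detector import WakeDetector, WakeConfig

    def utterance(burst_s: float, sr: int = 16_000) -> bytes:
        """1 s of silence, a noise burst of burst_s seconds, 1 s of silence."""
        rng = np.random.default_rng(0)
        speech = rng.normal(0, 1000, int(sr * burst_s)).astype(np.int16)
        quiet = np.zeros(sr, dtype=np.int16)
        return np.concatenate([quiet, speech, quiet]).tobytes()

    print(WakeDetector(WakeConfig()).process(utterance(0.5)))  # True: inside 0.2-1.5 s
    print(WakeDetector(WakeConfig()).process(utterance(0.1)))  # False: too short (a cough)

Fresh detector instances are used for each call so the wall-clock cooldown
from the first fire can't mask the second result.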