Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
cold-load too slow on Jetson CPU)
Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
(wake_and_command / always_on / always_on_gated), hysteretic VAD,
pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
/s-/ phonetic wake-verify, full-turn debug WAV recording.
Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
(move_map + step_duration_sec) via API/zmq_api.py; no more
hardcoded 0.3 / 2.0 magic numbers.
API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).
Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json
Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, Doc/environment.md changelog.
Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1176 lines
53 KiB
Python
#!/usr/bin/env python3
"""
Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT).

Pipeline:
    G1 mic ─► custom wake detector (numpy, offline, instant)
                    │
                    ▼
    TTS "Yes" (AudioAPI → G1 TtsMaker)
                    │
                    ▼
    record command audio until silence
                    │
                    ▼
    faster-whisper base.en int8 (CPU) ──► brain callback(text)

Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no
ML). STT runs only on the recorded command, not on every 2 s of mic input,
so the CPU cost is bounded by how often the user talks.

Why faster-whisper (CTranslate2) instead of openai-whisper:
    The Jetson's torch-aarch64 build has a Categorical sampler bug that
    produces NaN logits on low-SNR input, which is exactly what the G1
    far-field mic captures. faster-whisper bypasses torch entirely and
    runs the int8-quantized model through CTranslate2 — same quality as
    Whisper base, no numerical instability, 3× faster on this hardware.
"""

from __future__ import annotations
|
||
|
||
import logging
|
||
import os
|
||
import sys
|
||
import threading
|
||
import time
|
||
from logging.handlers import RotatingFileHandler
|
||
from typing import Callable, Optional
|
||
|
||
import numpy as np
|
||
|
||
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
if _PROJECT_DIR not in sys.path:
|
||
sys.path.insert(0, _PROJECT_DIR)
|
||
from Core.env_loader import PROJECT_ROOT
|
||
from Core.config_loader import load_config
|
||
|
||
# All voice-layer logging goes to a rotating file under <project>/logs.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        # File-only handler list: no StreamHandler is configured, so the
        # voice loop's frequent INFO lines never flood an attached console.
        # 3 backups × 5 MB bounds disk use for long sessions.
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
# Module logger used by every function/method in this file.
log = logging.getLogger("marcus_voice")
# Module-level vocabulary containers. EMPTY on import — populated by
# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
# command_vocab, garbage_patterns}. Config is the single source of truth;
# there are no hardcoded string lists here anymore.
#
# If you import this module without running a VoiceModule() first, these
# stay empty → fuzzy-match is a no-op, wake detection rejects everything,
# garbage filter rejects nothing. That's by design: bad config = obvious
# broken behavior, not silently-drifting hardcoded defaults.
WAKE_WORDS: set[str] = set()        # lowercased wake-word variants
COMMAND_VOCAB: list[str] = []       # canonical command phrases (config order)
GARBAGE_PATTERNS: set[str] = set()  # lowercased known Whisper noise outputs
_MIN_TRANSCRIPTION_LENGTH: int = 3  # shorter transcripts treated as noise
def _has_wake_word(text: str) -> bool:
    """
    Report whether *text* contains any configured wake-word variant as a
    whole word. The match is word-boundary anchored, not a substring test,
    so e.g. "standard" does not trigger off "sand".
    """
    import re
    lowered = text.lower()
    return any(
        re.search(r'\b' + re.escape(variant) + r'\b', lowered)
        for variant in WAKE_WORDS
    )
def _strip_wake_word_once(text: str) -> str:
    """Single pass of wake-word stripping. Use via _strip_wake_word()."""
    import re

    bare = text.strip()

    # Case 1: the whole utterance is one wake word plus optional trailing
    # punctuation. Return "" so the caller can ack-only.
    if any(
        re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', bare, re.IGNORECASE)
        for w in WAKE_WORDS
    ):
        return ""

    # Longest variants first so a longer variant can't be shadowed by a
    # shorter one that prefixes it.
    for variant in sorted(WAKE_WORDS, key=len, reverse=True):
        # Case 2: "Sanad <command>" — leading wake word. Whitespace (or
        # comma + whitespace) is required between wake word and command so
        # "Sanad." doesn't swallow "." as a command.
        leading = re.match(
            rf'^\s*{re.escape(variant)}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if leading:
            return leading.group(1).strip(' ,.!?')

        # Case 3: "<command> Sanad" — trailing wake word.
        trailing = re.match(
            rf'^(.+?)\s+{re.escape(variant)}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if trailing:
            return trailing.group(1).strip(' ,.!?')

    return text
def _strip_wake_word(text: str) -> str:
    """
    Remove the wake word from the start or end of text, iteratively,
    so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse
    to the actual command (or empty string if nothing else was said).

    Examples:
        "Sanad, turn left"  → "turn left"
        "Sanad turn left"   → "turn left"
        "turn left Sanad"   → "turn left"
        "Sanad."            → ""
        "Sanad"             → ""
        "Sanad. Sanad."     → ""      (repeated wake collapses fully)
        "Sanad Sanad stop"  → "stop"  (recursive strip)
    """
    # Bounded fixed-point iteration: each pass peels off at most one wake
    # word; the cap keeps malicious/garbled input from looping forever.
    remaining = text
    for _ in range(5):
        peeled = _strip_wake_word_once(remaining)
        if peeled == remaining:
            break
        remaining = peeled
    return remaining
def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a Whisper transcription to the closest known command phrase.

    Returns the canonical command if there's a close-enough match, else
    returns the original text unchanged. Close = difflib SequenceMatcher
    ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while
    accepting common Whisper near-misses like "Turn right up"→"turn right"
    or "What do you see?"→"what do you see").

    Also handles the "transcription contains a command" case — if the
    text has a command phrase in it (e.g. "Sanad, turn left" from an
    echo), extract the command.

    Fix over the previous version: the containment check used a plain
    substring test (`cmd in low`), which let short commands fire inside
    unrelated words ("go" inside "good", "hi" inside "this"). Containment
    is now word-boundary anchored, and when several commands are contained
    the longest one wins ("go home" beats "go").
    """
    import re
    from difflib import SequenceMatcher

    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    # Cheap containment win first — no fuzzy needed if a command phrase
    # literally appears (as whole words) in the transcription.
    contained = [
        cmd for cmd in COMMAND_VOCAB
        if re.search(r'\b' + re.escape(cmd) + r'\b', low)
    ]
    if contained:
        return max(contained, key=len)

    # Fuzzy fallback: best SequenceMatcher ratio over the whole vocab.
    best_cmd = None
    best_ratio = 0.0
    for cmd in COMMAND_VOCAB:
        r = SequenceMatcher(None, low, cmd).ratio()
        if r > best_ratio:
            best_ratio = r
            best_cmd = cmd

    if best_ratio >= cutoff:
        return best_cmd
    return text
class VoiceModule:
|
||
    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        """
        Wire up the voice stack from Config/config_Voice.json.

        Args:
            audio_api:  audio facade; this class reads its `is_speaking`
                        flag to mute listening while the robot talks.
            on_command: callback invoked with transcribed command text
                        (stored here; dispatched elsewhere in this class).
            on_wake:    callback invoked on wake detection (stored here).

        Side effect: populates the module-level WAKE_WORDS, COMMAND_VOCAB,
        GARBAGE_PATTERNS and _MIN_TRANSCRIPTION_LENGTH containers from the
        stt.* config section — config is the single source of truth for
        all voice vocabulary.
        """
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake

        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Load all voice vocabulary from config — these are the only
        # string lists the voice layer uses, and they come from
        # config_Voice.json. If a key is missing, the list is empty and
        # that feature silently degrades (fuzzy-match no-op, nothing
        # rejected as garbage, no wake-word match) — NEVER crashes.
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH
        WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])}
        COMMAND_VOCAB = list(self._stt.get("command_vocab", []))
        GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns",
                 len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS))

        # ── Custom wake detector ──
        # Pure-DSP energy detector; every knob is overridable via stt.*.
        from Voice.wake_detector import WakeDetector, WakeConfig
        wcfg = WakeConfig(
            sample_rate = 16_000,
            speech_threshold = float(self._stt.get("speech_threshold", 80.0)),
            min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)),
            max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)),
            post_silence_s = float(self._stt.get("post_silence", 0.30)),
            cooldown_s = float(self._stt.get("wake_cooldown", 1.50)),
            chunk_ms = int( self._stt.get("wake_chunk_ms", 50)),
            adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)),
            adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)),
            diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)),
        )
        self._detector = WakeDetector(wcfg)

        # ── G1 mic ──
        # Multicast-UDP mic stream; defaults match the G1's builtin mic.
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group = _mcfg.get("group", "239.168.123.161"),
            port = _mcfg.get("port", 5555),
            buf_max = _mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate

        # ── global software mic gain ──
        # Applied to every byte read via _read_mic_gained, so VAD and
        # Whisper both see the boosted audio. One knob, uniform effect.
        # G1 far-field mic benefits from 2.0-3.0 for normal speaking
        # volume; above 4.0 loud words start clipping.
        self._mic_gain = float(self._stt.get("mic_gain", 1.0))
        if self._mic_gain != 1.0:
            log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain)

        # ── faster-whisper (lazy-init on first wake, see _get_fw) ──
        self._fw = None

        # Loop/thread state; _cooldown_until suppresses wake detection
        # for a short window after a handled command.
        self._running = False
        self._thread = None
        self._cooldown_until = 0.0
        log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)")
# ─── gain-applied mic read ────────────────────────────
|
||
|
||
def _read_mic_raw(self, num_bytes: int) -> bytes:
|
||
"""Raw mic read — no gain. Used by the wake detector whose
|
||
thresholds are calibrated against unamplified G1 ambient."""
|
||
return self._mic_capture.read_chunk(num_bytes)
|
||
|
||
def _read_mic_gained(self, num_bytes: int) -> bytes:
|
||
"""
|
||
Mic read with self._mic_gain applied. Used during command
|
||
recording so Whisper sees a louder, cleaner signal. NOT used
|
||
in the wake loop — amplifying ambient there pushes it over
|
||
the wake threshold and the detector can never find its
|
||
silent baseline.
|
||
"""
|
||
raw = self._mic_capture.read_chunk(num_bytes)
|
||
if not raw or self._mic_gain == 1.0:
|
||
return raw
|
||
arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain
|
||
return np.clip(arr, -32768, 32767).astype(np.int16).tobytes()
|
||
|
||
# ─── lazy faster-whisper init ─────────────────────────
|
||
|
||
def _get_fw(self):
|
||
"""Load faster-whisper on first use — startup saved for cold path."""
|
||
if self._fw is not None:
|
||
return self._fw
|
||
model = self._stt.get("whisper_model", "base.en")
|
||
device = self._stt.get("whisper_device", "cpu")
|
||
compute = self._stt.get("whisper_compute_type", "int8")
|
||
log.info(
|
||
"Loading faster-whisper: model=%s device=%s compute=%s",
|
||
model, device, compute,
|
||
)
|
||
try:
|
||
from faster_whisper import WhisperModel
|
||
self._fw = WhisperModel(model, device=device, compute_type=compute)
|
||
log.info("faster-whisper ready")
|
||
except Exception as e:
|
||
log.error("faster-whisper init failed: %s — voice will be wake-only", e)
|
||
self._fw = None
|
||
return self._fw
|
||
|
||
# ─── command recording ────────────────────────────────
|
||
|
||
    def _record_command(self) -> np.ndarray:
        """
        Record the user's command with a hysteretic, adaptive-baseline VAD.

        Returns int16 mono PCM at self._sample_rate, trimmed to start
        ~300 ms before the first detected speech; empty array if no
        speech crossed the entry threshold within max_record_sec.

        Design (handles quiet, normal, and loud voices on the G1 mic):

        1. Learn the noise floor from the wake detector's rolling
           baseline (see below) and derive the silence/speech gates
           from it. Eliminates the "my silence threshold is higher
           than my user's speaking level" failure mode.

        2. Two thresholds with hysteresis:
             speech_entry — RMS required to count as "speech started"
             silence_exit — RMS below which we count silence
                            (< speech_entry; prevents mid-word bail on
                            breaths and short consonant gaps).

        3. Recording can only *end* after we've actually heard speech.
           Pure silence just runs out to max_record_sec, then returns
           empty (the caller plays "I didn't catch that" without
           burning a Whisper call on noise).

        4. After speech is seen, silence_budget accumulates only while
           RMS stays below silence_exit. A single loud burst resets
           it to zero — so natural "turn... left" pauses don't end the
           recording.
        """
        # ── config knobs (all overridable via config_Voice.json::stt) ─
        speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0))
        silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0))
        silence_dur = float(self._stt.get("silence_duration_sec", 1.2))
        max_dur = float(self._stt.get("max_record_sec", 8.0))
        min_dur = float(self._stt.get("min_record_sec", 0.4))
        # NOTE(review): ambient_probe_s is read but no longer used now that
        # the record-time probe was removed — kept for config compatibility.
        ambient_probe_s = float(self._stt.get("ambient_probe_sec", 0.2))
        ambient_mult = float(self._stt.get("ambient_mult", 2.5))

        small_chunk_bytes = 1024          # per-read size from the mic
        analysis_ms = 100                 # VAD decision window
        analysis_bytes = int(self._sample_rate * analysis_ms / 1000) * 2

        # ── 1. Reuse the wake detector's baseline instead of probing
        # the mic right now. The wake detector's _baseline is a rolling
        # mean of idle-silence RMS values from the last few seconds.
        #
        # Why NOT probe at record-time: we arrive here right after TTS
        # "Yes", and the user typically starts speaking within 200 ms
        # of hearing the ack. A probe window sized to the ambient floor
        # then measures the *user's speech* as "ambient" and sets
        # speech_entry above the user's actual amplitude — causing the
        # "no speech in 8.00s" failure mode observed in the wild.
        #
        # Cap the baseline at a sensible ceiling so a one-off loud
        # transient during idle doesn't lock us out either.
        probe_buf = bytearray()  # no probe audio kept
        ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0
        ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0))
        ambient_rms = min(ambient_rms, ambient_cap)

        if ambient_rms > 0:
            # Scale both gates off ambient, but never below the config floor.
            adaptive_exit = max(silence_exit_rms, ambient_rms * ambient_mult)
            adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8)
        else:
            adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms

        log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) "
                 "speech_entry=%.0f silence_exit=%.0f",
                 ambient_rms, ambient_cap, adaptive_entry, adaptive_exit)

        # ── 2. main capture loop ──────────────────────────────────────
        collected = bytearray(probe_buf)  # keep probe audio — user may
                                          # have already started talking
        analysis_buf = bytearray()        # staged bytes awaiting RMS analysis
        silence_budget = 0.0              # seconds of post-speech silence seen
        total_time = len(probe_buf) / 2 / self._sample_rate
        speech_seen = False
        peak_rms_seen = 0.0
        # Byte offset into `collected` at which speech first crossed
        # adaptive_entry. We trim pre-speech silence to this point (minus
        # ~300 ms pre-roll) before returning. Keeping Whisper's input
        # tight (speech + small tails) improves transcription accuracy
        # by removing the ambient/HVAC portion that dilutes the mel
        # features.
        speech_start_byte: Optional[int] = None
        preroll_bytes = int(self._sample_rate * 0.3) * 2  # 300 ms
        wall_start = time.time()

        # Wall-clock guard (max_dur + 2) covers a stalled mic stream where
        # total_time (audio time) stops advancing.
        while total_time < max_dur and (time.time() - wall_start) < max_dur + 2:
            raw = self._read_mic_gained(small_chunk_bytes)
            if not raw:
                time.sleep(0.005)
                continue
            collected.extend(raw)
            analysis_buf.extend(raw)
            total_time += (len(raw) // 2) / self._sample_rate

            # Drain whole 100 ms analysis windows from the staging buffer.
            while len(analysis_buf) >= analysis_bytes:
                win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16)
                del analysis_buf[:analysis_bytes]
                rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2)))
                peak_rms_seen = max(peak_rms_seen, rms)

                if rms >= adaptive_entry:
                    if not speech_seen:
                        speech_seen = True
                        # Record where speech started (byte offset
                        # in `collected`) so we can trim pre-roll later.
                        speech_start_byte = max(0, len(collected) - preroll_bytes)
                    silence_budget = 0.0
                elif speech_seen and rms < adaptive_exit:
                    silence_budget += analysis_ms / 1000.0
                # between exit and entry → hold state (hysteresis zone)

            # end only *after* we've heard real speech
            if (speech_seen
                    and silence_budget >= silence_dur
                    and total_time >= min_dur):
                log.info("silence after speech at %.2fs (peak_rms=%.0f)",
                         total_time, peak_rms_seen)
                break

        if not speech_seen:
            log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping",
                     total_time, peak_rms_seen, adaptive_entry)
            return np.array([], dtype=np.int16)

        if total_time >= max_dur:
            log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)",
                     total_time, peak_rms_seen)

        # Trim leading pre-speech silence. Keep 300 ms of pre-roll so
        # the onset of the first phoneme is preserved for Whisper.
        # (speech_start_byte == 0 means nothing to trim.)
        if speech_start_byte and speech_start_byte > 0:
            trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000
            log.info("trimmed %.0f ms of leading silence "
                     "(pre-speech buffer %d bytes)",
                     trimmed_ms, speech_start_byte)
            collected = collected[speech_start_byte:]

        return (np.frombuffer(bytes(collected), dtype=np.int16)
                if collected else np.array([], dtype=np.int16))
# ─── transcription ────────────────────────────────────
|
||
|
||
    def _transcribe(self, audio_i16: np.ndarray) -> str:
        """
        int16 PCM → Whisper transcription. Returns '' on no-speech/noise,
        on model failure, and for garbage-filtered transcripts.
        """
        fw = self._get_fw()
        if fw is None:
            return ""

        # mic_gain was already applied in _read_mic_gained() during
        # _record_command, so audio_i16 here is already boosted.

        # int16 → float32 [-1, 1] + DSP pre-processing:
        # 1. DC offset removal (subtract mean) — removes any mic bias
        # 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise,
        #    and speaker-vibration resonance. Whisper ignores the
        #    rumble band anyway, but it inflates RMS estimation and
        #    steals dynamic range from the speech band.
        # 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost
        #    that sharpens consonants (/t/, /s/, /k/ plosives/fricatives)
        #    which Whisper's mel features care most about.
        # 4. Peak-normalize to 0.7.
        audio_f32 = audio_i16.astype(np.float32) / 32768.0
        # 1. DC removal
        audio_f32 = audio_f32 - np.mean(audio_f32)
        # 2. High-pass at 80 Hz (1-pole IIR, stable + cheap)
        audio_f32 = self._highpass_80hz(audio_f32)
        # 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1]
        audio_f32 = np.append(
            audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1]
        )
        # 4. Peak-normalize (only boost quiet audio — never attenuate)
        peak = float(np.abs(audio_f32).max())
        if peak > 1e-4 and peak < 0.7:
            boost = 0.7 / peak
            audio_f32 = audio_f32 * boost
            log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak)

        # Initial prompt biases the model toward our command vocabulary.
        # Whisper uses this as decoder context — words in the prompt become
        # more likely, which converts ambiguous low-SNR audio like "muv rahh"
        # from a plausible English phrase ("and provide") into the intended
        # command ("move right"). Keep short — long prompts can be echoed.
        init_prompt = self._stt.get(
            "whisper_initial_prompt",
            "turn left, turn right, move forward, walk back, stop, come here, "
            "sit down, stand up, raise arm, wave, look around, what do you see, "
            "remember this, go home, patrol."
        )

        beam_size = int(self._stt.get("whisper_beam_size", 5))
        no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6))
        log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0))
        compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4))

        # Temperature fallback: greedy first (T=0), then 0.2, then 0.4.
        # Whisper retries automatically when a pass is rejected by
        # its confidence gates (log_prob < threshold etc.). On noisy
        # audio this commonly rescues a bad greedy decode.
        temperatures = self._stt.get(
            "whisper_temperature_fallback", [0.0, 0.2, 0.4]
        )
        try:
            segments, info = fw.transcribe(
                audio_f32,
                language="en",
                beam_size=beam_size,              # 5 = much better than greedy on noisy audio
                temperature=temperatures,         # greedy → 0.2 → 0.4 fallback
                initial_prompt=init_prompt,       # command-vocabulary bias (see above)
                condition_on_previous_text=False,
                vad_filter=False,                 # we already trimmed silence
                without_timestamps=True,
                # Whisper's built-in gates — drop transcripts that look
                # like hallucinations (very low prob, highly compressed).
                no_speech_threshold=no_speech_threshold,
                log_prob_threshold=log_prob_threshold,
                compression_ratio_threshold=compression_ratio_t,
            )
            # Collect segments and their mean log-prob for a confidence signal.
            seg_list = list(segments)
            text = " ".join(s.text for s in seg_list).strip()
            nsp = float(getattr(info, "no_speech_prob", 0.0))
            if seg_list:
                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
                log.info("whisper: lp=%.2f nsp=%.2f text=%r",
                         mean_lp, nsp, text[:80])
            else:
                # CRITICAL: log even when Whisper returned zero segments
                # so we can see WHY it dropped everything. Usually nsp is
                # above the threshold or the log-prob fallback killed it.
                log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop",
                         nsp, no_speech_threshold, log_prob_threshold)
        except Exception as e:
            log.error("faster-whisper transcribe failed: %s", e)
            return ""

        if not text:
            return ""

        # Reject Whisper garbage patterns (stt.garbage_patterns) and
        # transcriptions shorter than stt.min_transcription_length.
        # Preserve:
        #   - bare wake words (valid "just Sanad" signal → ack)
        #   - exact matches in stt.command_vocab (legitimate short
        #     commands like "go", "hi" must survive the length filter)
        low = text.lower().strip().rstrip(".!?,")
        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
        if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
            if low not in WAKE_WORDS and low not in vocab_exact:
                log.info("Rejecting likely noise transcription: %r", text)
                return ""

        # NOTE: fuzzy-match to canonical command phrase used to happen
        # here, but it runs BEFORE gated-mode could see the wake word.
        # Moved to _normalize_command() and called at dispatch time
        # AFTER the wake-word gate + wake-word strip, so the gate
        # always sees the raw Whisper text.
        return text
@staticmethod
|
||
def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray:
|
||
"""
|
||
1-pole IIR high-pass at ~80 Hz. Attenuates HVAC/fan rumble
|
||
without touching the speech band. Cheap: 2 multiplies per sample.
|
||
"""
|
||
if x.size < 2:
|
||
return x
|
||
# Alpha from fc=80Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc)
|
||
import math
|
||
rc = 1.0 / (2 * math.pi * 80.0)
|
||
dt = 1.0 / sr
|
||
alpha = rc / (rc + dt)
|
||
y = np.empty_like(x)
|
||
y[0] = x[0]
|
||
# vectorised enough — the loop is JITted by numpy internally
|
||
# for reasonable sizes (~25k samples).
|
||
prev_y, prev_x = x[0], x[0]
|
||
for i in range(1, x.size):
|
||
cur = alpha * (prev_y + x[i] - prev_x)
|
||
y[i] = cur
|
||
prev_y, prev_x = cur, x[i]
|
||
return y
|
||
|
||
    def _transcribe_raw(self, audio_i16: np.ndarray) -> str:
        """
        Like _transcribe but WITHOUT the garbage-pattern / length filters
        and without the `initial_prompt` bias. Used for wake verify, where:
          - We only care about the first phoneme (s/sh/z) — a 2-char "so"
            is a valid /sa-/ signature and MUST NOT be dropped by
            min_transcription_length.
          - A biased initial_prompt makes Whisper echo itself on unclear
            audio ("This is a robot assistant" → not s-starting → reject).
            The downside (no Sanad nudge) is fine here because the acoustic
            detector has already gated out non-speech.
        """
        fw = self._get_fw()
        if fw is None:
            return ""
        # Wake-burst audio comes from the raw (ungained) mic path, so the
        # software gain is applied here instead, with int16 clipping.
        if self._mic_gain != 1.0:
            audio_i16 = np.clip(
                audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767
            ).astype(np.int16)
        # Peak-normalize quiet audio up to 0.7; never attenuate.
        audio_f32 = audio_i16.astype(np.float32) / 32768.0
        peak = float(np.abs(audio_f32).max())
        if peak > 1e-4 and peak < 0.7:
            audio_f32 = audio_f32 * (0.7 / peak)
        try:
            segments, info = fw.transcribe(
                audio_f32,
                language="en",
                beam_size=int(self._stt.get("whisper_beam_size", 5)),
                temperature=0.0,
                initial_prompt="",  # NO bias → NO prompt echo
                condition_on_previous_text=False,
                vad_filter=False,
                without_timestamps=True,
                # Looser gates — we're about to do phonetic match,
                # not trust the transcription verbatim.
                no_speech_threshold=0.85,
                log_prob_threshold=-1.8,
                compression_ratio_threshold=3.0,
            )
            seg_list = list(segments)
            text = " ".join(s.text for s in seg_list).strip()
            if seg_list:
                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
                log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r",
                         mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80])
            return text
        except Exception as e:
            log.error("whisper-raw transcribe failed: %s", e)
            return ""
# ─── command transcription ────────────────────────────
|
||
|
||
def _transcribe_command(self, audio_i16: np.ndarray) -> str:
|
||
"""
|
||
Decode the recorded command audio with faster-whisper. Thin
|
||
wrapper over self._transcribe(); exists so _handle_wake and the
|
||
always-on loop share one entry point.
|
||
"""
|
||
if audio_i16.size == 0:
|
||
return ""
|
||
return self._transcribe(audio_i16)
|
||
|
||
def _save_turn_wav(
|
||
self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd",
|
||
) -> Optional[str]:
|
||
"""
|
||
Save a single-turn command recording for debugging.
|
||
|
||
Filename: {tag}_{epoch}_{sanitised_transcription}.wav
|
||
Examples:
|
||
cmd_1728562000_turn_right.wav ← successful command
|
||
cmd_1728562030_hi.wav ← Whisper misheard as 'Hi'
|
||
unk_1728562045_.wav ← Whisper returned empty
|
||
cmd_1728562060_thanks_for_watch.wav ← garbage-filtered
|
||
|
||
Rotation: keeps the most recent 50 across all tags so the disk
|
||
doesn't fill up during a long session. Tunable via
|
||
stt.recording_keep_count.
|
||
"""
|
||
try:
|
||
import re as _re
|
||
import wave
|
||
rec_dir = os.path.join(
|
||
PROJECT_ROOT,
|
||
self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
|
||
)
|
||
os.makedirs(rec_dir, exist_ok=True)
|
||
|
||
# Rotate — keep only the most recent N across all command WAVs.
|
||
keep = int(self._stt.get("recording_keep_count", 50))
|
||
existing = sorted(
|
||
f for f in os.listdir(rec_dir)
|
||
if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav")
|
||
)
|
||
for old in existing[:max(0, len(existing) - keep + 1)]:
|
||
try: os.remove(os.path.join(rec_dir, old))
|
||
except Exception: pass
|
||
|
||
# Sanitise transcription for filename: lowercase, alnum + _, <=40 chars
|
||
slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40]
|
||
path = os.path.join(
|
||
rec_dir, f"{tag}_{int(time.time())}_{slug}.wav"
|
||
)
|
||
with wave.open(path, "wb") as w:
|
||
w.setnchannels(1)
|
||
w.setsampwidth(2)
|
||
w.setframerate(self._sample_rate)
|
||
w.writeframes(audio_i16.astype(np.int16).tobytes())
|
||
return path
|
||
except Exception as e:
|
||
log.warning("failed to save turn wav: %s", e)
|
||
return None
|
||
|
||
def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]:
|
||
"""Backward-compat wrapper — save with the `unk` tag."""
|
||
return self._save_turn_wav(audio_i16, transcription="", tag="unk")
|
||
|
||
# ─── command normalization (post-gate) ────────────────
|
||
|
||
def _normalize_command(self, text: str) -> str:
|
||
"""
|
||
Apply fuzzy-match to the closest canonical command phrase.
|
||
Call AFTER the gated wake check so the wake word has already
|
||
been stripped by the caller if appropriate. Turns near-misses
|
||
like "Turn right up" → "turn right" so command_parser.py's
|
||
regex fast-path can hit them without an LLM round-trip.
|
||
"""
|
||
canonical = _closest_command(text, cutoff=self._vocab_cutoff)
|
||
if canonical != text:
|
||
log.info("fuzzy-match: %r → %r", text, canonical)
|
||
return canonical
|
||
|
||
# ─── main loop ────────────────────────────────────────
|
||
|
||
def _voice_loop(self):
|
||
"""
|
||
Dispatch to the right loop based on stt.mode:
|
||
"wake_and_command" — require "Sanad" wake word (acoustic), then
|
||
record and transcribe a command.
|
||
"always_on" — Transcribe every utterance, log all, and
|
||
dispatch all to the brain. No wake.
|
||
"always_on_gated" — Transcribe every utterance and log all,
|
||
but ONLY dispatch utterances that contain
|
||
"Sanad" (fuzzy). Wake word is stripped
|
||
before the command is sent to the brain.
|
||
"""
|
||
mode = self._stt.get("mode", "wake_and_command").lower()
|
||
self._mic_capture.start()
|
||
if mode in ("always_on", "always_on_gated"):
|
||
self._voice_loop_always_on(gated=(mode == "always_on_gated"))
|
||
else:
|
||
self._voice_loop_wake()
|
||
|
||
def _voice_loop_wake(self):
|
||
"""Classic wake-and-command: listen for 'Sanad', then record command."""
|
||
log.info("Voice loop started — listening for wake (energy-based)")
|
||
|
||
was_speaking = False
|
||
while self._running:
|
||
try:
|
||
if self._audio.is_speaking:
|
||
was_speaking = True
|
||
time.sleep(0.1)
|
||
self._detector.reset()
|
||
continue
|
||
|
||
if was_speaking:
|
||
time.sleep(0.25)
|
||
self._mic_capture.flush()
|
||
self._detector.reset()
|
||
was_speaking = False
|
||
|
||
if time.time() < self._cooldown_until:
|
||
_ = self._read_mic_raw(1024)
|
||
self._detector.reset()
|
||
time.sleep(0.05)
|
||
continue
|
||
|
||
chunk = self._read_mic_raw(1024)
|
||
if not chunk:
|
||
continue
|
||
|
||
if self._detector.process(chunk):
|
||
self._handle_wake()
|
||
except Exception as e:
|
||
log.error("Voice loop error: %s", e, exc_info=True)
|
||
time.sleep(1)
|
||
|
||
    def _voice_loop_always_on(self, gated: bool = False):
        """
        Always-on mode — Sanad-style continuous listening.

        If `gated` is True, utterances that don't contain the wake word
        "Sanad" (or a fuzzy variant) are logged but NOT dispatched to the
        brain — the robot hears everything, speaks only when addressed.

        Architecture (no wake word, no ack TTS):
          1. Continuously read the gained mic stream in 32 ms chunks.
          2. Run a hysteretic VAD on the stream — speech_entry_rms
             starts an utterance, silence_exit_rms + silence_duration
             ends one.
          3. On each utterance end → Whisper transcribe → fuzzy-match
             → dispatch to brain.
          4. Every ~5 s of idle: log a `ambient: rms=... peak=...` line
             so you can SEE what the mic is doing at all times, even
             when nobody's talking. Matches Sanad's "always listening"
             visibility.
          5. Speech is not gated on amplitude — everything above the
             entry threshold is captured, quiet or loud. Loud speech
             clips naturally against int16; Whisper handles it.

        Thresholds come from the same stt.* config as wake mode but are
        typically tuned lower here (you want eager capture since there's
        no wake-word gate to prevent false positives).
        """
        log.info(
            "Voice loop started — ALWAYS-ON mode%s",
            " [gated: only 'Sanad' utterances dispatched]" if gated
            else " (no wake word — every utterance dispatched)"
        )

        # All tunables are config-driven (stt.*) with conservative defaults.
        speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0))
        silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0))
        silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8))
        min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3))
        max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0))
        idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0))
        ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4))
        ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100))

        # VAD state for the in-progress utterance.
        buffer = bytearray()      # raw int16 bytes captured so far
        in_speech = False         # True while an utterance is being recorded
        silence_budget = 0.0      # consecutive seconds below the exit threshold
        speech_duration = 0.0     # seconds captured into `buffer`
        peak_rms = 0.0            # loudest chunk seen in this utterance
        # Idle-period stats, reset after each periodic ambient log line.
        idle_peak_rms = 0.0
        idle_sum_rms = 0.0
        idle_chunks = 0
        last_idle_log = time.time()
        was_speaking_tts = False  # True on the first pass after our own TTS ends

        # Rolling ambient (idle-only) RMS buffer. Used to adapt silence_exit
        # so a noisy room doesn't trap the VAD at max_utter_s: if the
        # observed idle floor sits at rms=200, silence_exit needs to be
        # above 200 or silence never accumulates. We take
        # effective_exit = max(config_silence_exit, ambient_floor * mult).
        ambient_buf: list = []
        ambient_floor = 0.0

        # Seed ambient_floor by sampling ~1s of mic BEFORE entering the
        # loop. Without this, the very first utterance runs with
        # ambient_floor=0 → eff_exit=config_floor, which under-cuts
        # noisy rooms and creates self-sustaining echo loops.
        seed_chunks = []
        seed_deadline = time.time() + 1.0
        while time.time() < seed_deadline:
            r = self._read_mic_gained(1024)
            if r:
                a = np.frombuffer(r, dtype=np.int16)
                if a.size:
                    seed_chunks.append(
                        float(np.sqrt(np.mean(a.astype(np.float64) ** 2)))
                    )
            else:
                time.sleep(0.005)
        if seed_chunks:
            # Use the median so one loud transient doesn't poison the seed.
            seed_chunks.sort()
            ambient_floor = seed_chunks[len(seed_chunks) // 2]
            ambient_buf = list(seed_chunks[-ambient_win:])
            log.info("ambient seeded: floor=%.0f from %d chunks",
                     ambient_floor, len(seed_chunks))

        while self._running:
            try:
                # Drop mic input while the robot itself is speaking so we
                # don't feed our own TTS back through Whisper.
                if self._audio.is_speaking:
                    was_speaking_tts = True
                    buffer.clear()
                    in_speech = False
                    silence_budget = 0.0
                    speech_duration = 0.0
                    peak_rms = 0.0
                    time.sleep(0.1)
                    continue

                # First pass after our TTS finished: wait out the speaker
                # reverberation, then discard stale buffered mic audio.
                if was_speaking_tts:
                    time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3)))
                    self._mic_capture.flush()
                    was_speaking_tts = False

                raw = self._read_mic_gained(1024)
                if not raw:
                    time.sleep(0.005)
                    continue

                arr = np.frombuffer(raw, dtype=np.int16)
                rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2)))
                # int16 samples are 2 bytes each → chunk length in seconds.
                chunk_s = (len(raw) // 2) / self._sample_rate

                if in_speech:
                    buffer.extend(raw)
                    speech_duration += chunk_s
                    peak_rms = max(peak_rms, rms)

                    # Adaptive silence exit: sits max(config_floor,
                    # ambient_floor × mult). Prevents the "room is noisier
                    # than silence_exit" failure mode where silence never
                    # accumulates and every utterance hits max_utter_s.
                    eff_exit = max(silence_exit, ambient_floor * ambient_mult)
                    if rms < eff_exit:
                        silence_budget += chunk_s
                    else:
                        silence_budget = 0.0

                    utter_over = (silence_budget >= silence_dur and
                                  speech_duration >= min_utter_s)
                    force_stop = speech_duration >= max_utter_s

                    if utter_over or force_stop:
                        reason = "max-duration" if force_stop else "silence"
                        audio = np.frombuffer(bytes(buffer), dtype=np.int16)
                        log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d",
                                 reason, speech_duration, peak_rms, audio.size)

                        # RESET STATE IMMEDIATELY — before any Whisper /
                        # speak() / dispatch. Previously a `continue` from
                        # the wake-only ack branch skipped the reset, and
                        # the 12-second buffer lived forever, re-transcribed
                        # every iteration into the same "Sanad" output,
                        # spawning a self-sustaining "Yes" loop.
                        buffer.clear()
                        in_speech = False
                        silence_budget = 0.0
                        speech_duration = 0.0
                        peak_rms = 0.0

                        text = self._transcribe_command(audio) if audio.size else ""
                        if text:
                            log.info("HEARD: %r", text)
                            # Gated mode: only dispatch if the wake word was
                            # spoken. Everything is still logged above so the
                            # operator has full visibility into what the mic
                            # is picking up.
                            if gated and not _has_wake_word(text):
                                log.info(" (no wake word — not dispatched)")
                            else:
                                if gated:
                                    command = _strip_wake_word(text)
                                    if command != text:
                                        log.info(" wake-stripped: %r → %r",
                                                 text, command)
                                    # Bare wake word ("Sanad.", "Sanad") →
                                    # speak a "Yes" ack, do NOT call the
                                    # brain (it would hallucinate a random
                                    # response from a 1-word prompt).
                                    if not command:
                                        log.info(" wake-only utterance — speaking ack")
                                        try:
                                            self._audio.speak(
                                                self._messages.get("wake_heard", "Yes")
                                            )
                                        except Exception as e:
                                            log.warning("wake-ack TTS failed: %s", e)
                                        continue
                                else:
                                    command = text

                                # Normalize near-misses ("Turn right up" →
                                # "turn right") so command_parser's regex
                                # fast-path can hit without an LLM round-trip.
                                command = self._normalize_command(command)

                                print(f' [Sanad] heard: "{command}"')
                                if self._on_command:
                                    try:
                                        self._on_command(command, "en")
                                    except Exception as e:
                                        log.error("on_command: %s", e, exc_info=True)
                        else:
                            log.info("utterance rejected (empty/garbage after Whisper)")
                else:
                    # Idle branch — no utterance in progress.
                    idle_peak_rms = max(idle_peak_rms, rms)
                    idle_sum_rms += rms
                    idle_chunks += 1

                    # Maintain the rolling ambient floor so silence_exit can
                    # adapt. Use windows that are *clearly* not speech
                    # (rms < speech_entry / 2) — otherwise a borderline
                    # window just before transition pollutes the floor.
                    if rms < speech_entry * 0.5:
                        ambient_buf.append(rms)
                        if len(ambient_buf) > ambient_win:
                            ambient_buf.pop(0)
                        if ambient_buf:
                            ambient_floor = sum(ambient_buf) / len(ambient_buf)

                    if rms >= speech_entry:
                        # utterance starts — keep this chunk as pre-roll
                        log.info("utterance start (rms=%.0f >= entry=%.0f)",
                                 rms, speech_entry)
                        buffer.extend(raw)
                        in_speech = True
                        speech_duration = chunk_s
                        peak_rms = rms
                        silence_budget = 0.0

                    # periodic ambient log while idle — "I am listening"
                    now = time.time()
                    if (now - last_idle_log) >= idle_log_s and idle_chunks > 0:
                        eff_exit = max(silence_exit, ambient_floor * ambient_mult)
                        log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d "
                                 "floor=%.0f entry=%.0f eff_exit=%.0f",
                                 idle_sum_rms / idle_chunks, idle_peak_rms,
                                 idle_chunks, ambient_floor, speech_entry, eff_exit)
                        idle_peak_rms = 0.0
                        idle_sum_rms = 0.0
                        idle_chunks = 0
                        last_idle_log = now
            except Exception as e:
                log.error("Always-on voice loop error: %s", e, exc_info=True)
                time.sleep(1)
|
||
|
||
def _handle_wake(self):
|
||
t_wake = time.time()
|
||
log.info("Wake detected (acoustic)")
|
||
|
||
# Verify the burst that triggered wake actually sounds like a
|
||
# wake word. The acoustic detector fires on ANY 0.2-1.5s burst
|
||
# (coughs, claps, door slams). We run a lightweight Whisper
|
||
# decode on the burst and accept if EITHER:
|
||
# (a) a wake-word variant is in the transcription, OR
|
||
# (b) the transcription starts with 's'/'sh'/'z' — Whisper's
|
||
# consistent signature for mishearing non-English "Sanad"
|
||
# as an English /sa-/ word ("Stop", "Set", "Sand", "Send").
|
||
# Reject if Whisper returns empty (pure noise / cough) or
|
||
# confidently not-s speech ("hello", "okay").
|
||
if self._stt.get("wake_verify_enabled", True):
|
||
burst = self._detector.get_last_burst()
|
||
if burst is not None and burst.size >= int(0.15 * self._sample_rate):
|
||
t_verify = time.time()
|
||
# Lenient transcribe — no garbage filter, no min-length,
|
||
# no bias prompt. See _transcribe_raw docstring.
|
||
heard = self._transcribe_raw(burst)
|
||
verify_ms = (time.time() - t_verify) * 1000
|
||
low = (heard or "").lower().strip().lstrip('"\'.,!?')
|
||
if not low:
|
||
log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms)
|
||
return
|
||
starts_with_s = low.startswith(("s", "sh", "z"))
|
||
if _has_wake_word(heard):
|
||
log.info(" wake verified (wake-word: %r, %.0fms)",
|
||
heard, verify_ms)
|
||
elif starts_with_s:
|
||
log.info(" wake verified (s-phonetic: %r, %.0fms)",
|
||
heard, verify_ms)
|
||
else:
|
||
log.info(" wake REJECTED — %r (%.0fms, not s-starting)",
|
||
heard, verify_ms)
|
||
return
|
||
|
||
print("\n [Sanad] wake heard — listening…")
|
||
|
||
ack_mode = self._stt.get("wake_ack", "tts").lower()
|
||
if ack_mode == "none":
|
||
log.info(" wake-ack: silent (no TTS)")
|
||
else:
|
||
try:
|
||
self._audio.speak(self._messages.get("wake_heard", "Yes"))
|
||
except Exception as e:
|
||
log.warning("TTS ack failed: %s", e)
|
||
|
||
# Wait for ack TTS + speaker reverberation to decay
|
||
while self._audio.is_speaking:
|
||
time.sleep(0.05)
|
||
settle = float(self._stt.get("post_tts_settle_sec", 0.3))
|
||
time.sleep(settle)
|
||
self._mic_capture.flush()
|
||
log.info(" wake→record-ready: %.2fs", time.time() - t_wake)
|
||
|
||
log.info("Recording command...")
|
||
audio = self._record_command()
|
||
# _record_command returns empty if it never saw speech above the
|
||
# adaptive entry threshold — no point running STT on noise.
|
||
# Two cases:
|
||
# audio.size == 0 → no speech at all (likely false wake
|
||
# from cough/slam). SILENTLY reset —
|
||
# don't blurt "I didn't catch that" on
|
||
# what was never a real interaction.
|
||
# 0 < size < 8000 → brief speech burst (< 0.5s). Probably
|
||
# a real-but-unintelligible attempt;
|
||
# speak "I didn't catch that" so the
|
||
# user knows to retry.
|
||
if audio.size == 0:
|
||
log.info("Command dropped (no speech — likely false wake); silent reset")
|
||
self._cooldown_until = time.time() + float(
|
||
self._stt.get("command_cooldown_sec", 1.5))
|
||
return
|
||
if audio.size < 8000: # < 0.5 s but > 0 — real short attempt
|
||
log.info("Command too short (%.2fs); asking user to repeat",
|
||
audio.size / self._sample_rate)
|
||
try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that"))
|
||
except Exception: pass
|
||
self._cooldown_until = time.time() + float(
|
||
self._stt.get("command_cooldown_sec", 1.5))
|
||
return
|
||
|
||
peak = int(np.abs(audio).max())
|
||
rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2)))
|
||
log.info("command audio: samples=%d peak=%d rms=%.1f",
|
||
audio.size, peak, rms)
|
||
|
||
text = self._transcribe_command(audio)
|
||
if not text:
|
||
log.info("Empty or rejected transcription")
|
||
# Save WAV of the failed transcription for post-mortem.
|
||
if self._stt.get("recording_enabled", True):
|
||
self._save_turn_wav(audio, transcription="", tag="unk")
|
||
try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that"))
|
||
except Exception: pass
|
||
self._cooldown_until = time.time() + float(
|
||
self._stt.get("command_cooldown_sec", 1.5))
|
||
return
|
||
|
||
# Normalize near-miss transcriptions like "Turn right up" → "turn
|
||
# right" so the brain's regex fast-path catches them.
|
||
text = self._normalize_command(text)
|
||
log.info("Transcribed: %s", text[:120])
|
||
|
||
# Save every successful command recording so you can listen back
|
||
# later and see what the mic actually heard vs what Whisper
|
||
# transcribed. Disable with stt.recording_enabled=false.
|
||
if self._stt.get("recording_enabled", True):
|
||
wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd")
|
||
if wav_path:
|
||
log.info("saved: %s", os.path.basename(wav_path))
|
||
|
||
if self._on_command:
|
||
try:
|
||
self._on_command(text, "en")
|
||
except Exception as e:
|
||
log.error("on_command error: %s", e, exc_info=True)
|
||
elif self._on_wake:
|
||
try: self._on_wake()
|
||
except Exception: pass
|
||
|
||
cd = float(self._stt.get("command_cooldown_sec", 1.5))
|
||
self._cooldown_until = time.time() + cd
|
||
log.info("wake→dispatch total: %.2fs | cooldown %.1fs",
|
||
time.time() - t_wake, cd)
|
||
|
||
# ─── start / stop ─────────────────────────────────────
|
||
|
||
def start(self):
|
||
if self._running:
|
||
log.warning("VoiceModule already running")
|
||
return
|
||
self._running = True
|
||
self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
|
||
self._thread.start()
|
||
log.info("Voice module started")
|
||
|
||
def stop(self):
|
||
self._running = False
|
||
try: self._mic_capture.stop()
|
||
except Exception: pass
|
||
if self._thread:
|
||
self._thread.join(timeout=5)
|
||
self._thread = None
|
||
log.info("Voice module stopped")
|
||
|
||
@property
|
||
def is_running(self) -> bool:
|
||
return self._running
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: run the voice module standalone against the real
    # AudioAPI and print every dispatched command.
    from API.audio_api import AudioAPI

    def _echo_command(text, lang):
        print(f"\n COMMAND [{lang}]: {text}\n")

    audio = AudioAPI()
    voice = VoiceModule(audio, on_command=_echo_command)
    print('Starting. Say "Sanad", then speak your command.\n')
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        voice.stop()
|