#!/usr/bin/env python3 """ Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT). Pipeline: G1 mic ─► custom wake detector (numpy, offline, instant) │ ▼ TTS "Yes" (AudioAPI → G1 TtsMaker) │ ▼ record command audio until silence │ ▼ faster-whisper base.en int8 (CPU) ──► brain callback(text) Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no ML). STT runs only on the recorded command, not on every 2 s of mic input, so the CPU cost is bounded by how often the user talks. Why faster-whisper (CTranslate2) instead of openai-whisper: The Jetson's torch-aarch64 build has a Categorical sampler bug that produces NaN logits on low-SNR input, which is exactly what the G1 far-field mic captures. faster-whisper bypasses torch entirely and runs the int8-quantized model through CTranslate2 — same quality as Whisper base, no numerical instability, 3× faster on this hardware. """ from __future__ import annotations import logging import os import sys import threading import time from logging.handlers import RotatingFileHandler from typing import Callable, Optional import numpy as np _PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _PROJECT_DIR not in sys.path: sys.path.insert(0, _PROJECT_DIR) from Core.env_loader import PROJECT_ROOT from Core.config_loader import load_config LOG_DIR = os.path.join(PROJECT_ROOT, "logs") os.makedirs(LOG_DIR, exist_ok=True) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", handlers=[ RotatingFileHandler( os.path.join(LOG_DIR, "voice.log"), maxBytes=5_000_000, backupCount=3, encoding="utf-8", ), ], ) log = logging.getLogger("marcus_voice") # Module-level vocabulary containers. EMPTY on import — populated by # VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words, # command_vocab, garbage_patterns}. Config is the single source of truth; # there are no hardcoded string lists here anymore. # # If you import this module without running a VoiceModule() first, these # stay empty → fuzzy-match is a no-op, wake detection rejects everything, # garbage filter rejects nothing. That's by design: bad config = obvious # broken behavior, not silently-drifting hardcoded defaults. WAKE_WORDS: set = set() COMMAND_VOCAB: list = [] GARBAGE_PATTERNS: set = set() _MIN_TRANSCRIPTION_LENGTH: int = 3 def _has_wake_word(text: str) -> bool: """ True if the utterance contains any wake-word variant as a *whole word* (word-boundary match, not substring — so "standard" doesn't trigger off "sand"). """ import re low = text.lower() for w in WAKE_WORDS: if re.search(r'\b' + re.escape(w) + r'\b', low): return True return False def _strip_wake_word_once(text: str) -> str: """Single pass of wake-word stripping. Use via _strip_wake_word().""" import re stripped = text.strip() # Case 1: the entire utterance is just a wake word + optional # trailing punctuation. Return empty string so caller can ack-only. for w in WAKE_WORDS: if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE): return "" # Case 2: "Sanad " — require whitespace (or comma+ws) between # wake word and command so "Sanad." doesn't swallow "." as a command. for w in sorted(WAKE_WORDS, key=len, reverse=True): m = re.match( rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$', text, re.IGNORECASE, ) if m: return m.group(1).strip(' ,.!?') # Case 3: " Sanad" — trailing wake word. m = re.match( rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$', text, re.IGNORECASE, ) if m: return m.group(1).strip(' ,.!?') return text def _strip_wake_word(text: str) -> str: """ Remove the wake word from the start or end of text, iteratively, so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse to the actual command (or empty string if nothing else was said). Examples: "Sanad, turn left" → "turn left" "Sanad turn left" → "turn left" "turn left Sanad" → "turn left" "Sanad." → "" "Sanad" → "" "Sanad. Sanad." → "" (was leaving "Sanad" before) "Sanad Sanad stop" → "stop" (recursive strip) """ # Iterate until stable — each pass peels off one wake word. Cap at # a handful of iterations so a malicious/garbled input can't loop. for _ in range(5): stripped = _strip_wake_word_once(text) if stripped == text: return text text = stripped return text def _closest_command(text: str, cutoff: float = 0.72) -> str: """ Map a Whisper transcription to the closest known command phrase. Returns the canonical command if there's a close-enough match, else returns the original text unchanged. Close = difflib SequenceMatcher ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while accepting common Whisper near-misses like "Turn right up"→"turn right" or "What do you see?"→"what do you see"). Also handles the "transcription contains a command" case — if the text has a command phrase as a substring (e.g. "Sanad, turn left" from an echo), extract the command. """ from difflib import SequenceMatcher low = text.lower().strip().rstrip(".!?,") if not low: return text # Cheap substring win first — no fuzzy needed if the command is # literally in the transcription. for cmd in COMMAND_VOCAB: if cmd in low: return cmd best_cmd = None best_ratio = 0.0 for cmd in COMMAND_VOCAB: r = SequenceMatcher(None, low, cmd).ratio() if r > best_ratio: best_ratio = r best_cmd = cmd if best_ratio >= cutoff: return best_cmd return text class VoiceModule: def __init__( self, audio_api, on_command: Optional[Callable] = None, on_wake: Optional[Callable] = None, ): self._audio = audio_api self._on_command = on_command self._on_wake = on_wake self._config = load_config("Voice") self._stt = self._config.get("stt", {}) self._messages = self._config.get("messages", {}) # Load all voice vocabulary from config — these are the only # string lists the voice layer uses, and they come from # config_Voice.json. If a key is missing, the list is empty and # that feature silently degrades (fuzzy-match no-op, nothing # rejected as garbage, no wake-word match) — NEVER crashes. global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])} COMMAND_VOCAB = list(self._stt.get("command_vocab", [])) GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])} _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3)) self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72)) log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS)) # ── Custom wake detector ── from Voice.wake_detector import WakeDetector, WakeConfig wcfg = WakeConfig( sample_rate = 16_000, speech_threshold = float(self._stt.get("speech_threshold", 80.0)), min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)), max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)), post_silence_s = float(self._stt.get("post_silence", 0.30)), cooldown_s = float(self._stt.get("wake_cooldown", 1.50)), chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)), adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)), diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)), ) self._detector = WakeDetector(wcfg) # ── G1 mic ── from Voice.builtin_mic import BuiltinMic _mcfg = self._config.get("mic_udp", {}) self._mic_capture = BuiltinMic( group = _mcfg.get("group", "239.168.123.161"), port = _mcfg.get("port", 5555), buf_max = _mcfg.get("buffer_max_bytes", 64000), ) self._sample_rate = self._mic_capture.sample_rate # ── global software mic gain ── # Applied to every byte read from the mic, so wake detector, VAD, # AND Whisper all see the boosted audio. One knob, uniform effect. # G1 far-field mic benefits from 2.0-3.0 for normal speaking volume; # above 4.0 you start clipping loud words. self._mic_gain = float(self._stt.get("mic_gain", 1.0)) if self._mic_gain != 1.0: log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) # ── faster-whisper (lazy-init on first wake) ── self._fw = None self._running = False self._thread = None self._cooldown_until = 0.0 log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") # ─── gain-applied mic read ──────────────────────────── def _read_mic_raw(self, num_bytes: int) -> bytes: """Raw mic read — no gain. Used by the wake detector whose thresholds are calibrated against unamplified G1 ambient.""" return self._mic_capture.read_chunk(num_bytes) def _read_mic_gained(self, num_bytes: int) -> bytes: """ Mic read with self._mic_gain applied. Used during command recording so Whisper sees a louder, cleaner signal. NOT used in the wake loop — amplifying ambient there pushes it over the wake threshold and the detector can never find its silent baseline. """ raw = self._mic_capture.read_chunk(num_bytes) if not raw or self._mic_gain == 1.0: return raw arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain return np.clip(arr, -32768, 32767).astype(np.int16).tobytes() # ─── lazy faster-whisper init ───────────────────────── def _get_fw(self): """Load faster-whisper on first use — startup saved for cold path.""" if self._fw is not None: return self._fw model = self._stt.get("whisper_model", "base.en") device = self._stt.get("whisper_device", "cpu") compute = self._stt.get("whisper_compute_type", "int8") log.info( "Loading faster-whisper: model=%s device=%s compute=%s", model, device, compute, ) try: from faster_whisper import WhisperModel self._fw = WhisperModel(model, device=device, compute_type=compute) log.info("faster-whisper ready") except Exception as e: log.error("faster-whisper init failed: %s — voice will be wake-only", e) self._fw = None return self._fw # ─── command recording ──────────────────────────────── def _record_command(self) -> np.ndarray: """ Record the user's command with a hysteretic, adaptive-baseline VAD. Design (handles quiet, normal, and loud voices on the G1 mic): 1. Sample 200 ms of ambient noise first to learn the floor, then set the "silence" gate to max(ambient * 2.5, floor). Eliminates the "my silence threshold is higher than my user's speaking level" failure mode. 2. Two thresholds with hysteresis: speech_entry — RMS required to count as "speech started" silence_exit — RMS below which we count silence (< speech_entry; prevents mid-word bail on breaths and short consonant gaps). 3. Recording can only *end* after we've actually heard speech. Pure silence just runs out to max_record_sec, then returns empty (the caller plays "I didn't catch that" without burning a Whisper call on noise). 4. After speech is seen, silence_budget accumulates only while RMS stays below silence_exit. A single loud burst resets it to zero — so natural "turn... left" pauses don't end the recording. """ # ── config knobs (all overridable via config_Voice.json::stt) ─ speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0)) silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0)) silence_dur = float(self._stt.get("silence_duration_sec", 1.2)) max_dur = float(self._stt.get("max_record_sec", 8.0)) min_dur = float(self._stt.get("min_record_sec", 0.4)) ambient_probe_s = float(self._stt.get("ambient_probe_sec", 0.2)) ambient_mult = float(self._stt.get("ambient_mult", 2.5)) small_chunk_bytes = 1024 analysis_ms = 100 analysis_bytes = int(self._sample_rate * analysis_ms / 1000) * 2 # ── 1. Reuse the wake detector's baseline instead of probing # the mic right now. The wake detector's _baseline is a rolling # mean of idle-silence RMS values from the last few seconds. # # Why NOT probe at record-time: we arrive here right after TTS # "Yes", and the user typically starts speaking within 200 ms # of hearing the ack. A probe window sized to the ambient floor # then measures the *user's speech* as "ambient" and sets # speech_entry above the user's actual amplitude — causing the # "no speech in 8.00s" failure mode observed in the wild. # # Cap the baseline at a sensible ceiling so a one-off loud # transient during idle doesn't lock us out either. probe_buf = bytearray() # no probe audio kept ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0 ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0)) ambient_rms = min(ambient_rms, ambient_cap) if ambient_rms > 0: adaptive_exit = max(silence_exit_rms, ambient_rms * ambient_mult) adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8) else: adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) " "speech_entry=%.0f silence_exit=%.0f", ambient_rms, ambient_cap, adaptive_entry, adaptive_exit) # ── 2. main capture loop ────────────────────────────────────── collected = bytearray(probe_buf) # keep probe audio — user may # have already started talking analysis_buf = bytearray() silence_budget = 0.0 total_time = len(probe_buf) / 2 / self._sample_rate speech_seen = False peak_rms_seen = 0.0 # Byte offset into `collected` at which speech first crossed # adaptive_entry. We trim pre-speech silence to this point (minus # ~300 ms pre-roll) before returning. Keeping Whisper's input # tight (speech + small tails) improves transcription accuracy # by removing the ambient/HVAC portion that dilutes the mel # features. speech_start_byte: Optional[int] = None preroll_bytes = int(self._sample_rate * 0.3) * 2 # 300 ms wall_start = time.time() while total_time < max_dur and (time.time() - wall_start) < max_dur + 2: raw = self._read_mic_gained(small_chunk_bytes) if not raw: time.sleep(0.005) continue collected.extend(raw) analysis_buf.extend(raw) total_time += (len(raw) // 2) / self._sample_rate while len(analysis_buf) >= analysis_bytes: win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16) del analysis_buf[:analysis_bytes] rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2))) peak_rms_seen = max(peak_rms_seen, rms) if rms >= adaptive_entry: if not speech_seen: speech_seen = True # Record where speech started (byte offset # in `collected`) so we can trim pre-roll later. speech_start_byte = max(0, len(collected) - preroll_bytes) silence_budget = 0.0 elif speech_seen and rms < adaptive_exit: silence_budget += analysis_ms / 1000.0 # between exit and entry → hold state (hysteresis zone) # end only *after* we've heard real speech if (speech_seen and silence_budget >= silence_dur and total_time >= min_dur): log.info("silence after speech at %.2fs (peak_rms=%.0f)", total_time, peak_rms_seen) break if not speech_seen: log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping", total_time, peak_rms_seen, adaptive_entry) return np.array([], dtype=np.int16) if total_time >= max_dur: log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)", total_time, peak_rms_seen) # Trim leading pre-speech silence. Keep 300 ms of pre-roll so # the onset of the first phoneme is preserved for Whisper. if speech_start_byte and speech_start_byte > 0: trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000 log.info("trimmed %.0f ms of leading silence " "(pre-speech buffer %d bytes)", trimmed_ms, speech_start_byte) collected = collected[speech_start_byte:] return (np.frombuffer(bytes(collected), dtype=np.int16) if collected else np.array([], dtype=np.int16)) # ─── transcription ──────────────────────────────────── def _transcribe(self, audio_i16: np.ndarray) -> str: """int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" fw = self._get_fw() if fw is None: return "" # mic_gain was already applied in _read_mic_gained() during # _record_command, so audio_i16 here is already boosted. # int16 → float32 [-1, 1] + DSP pre-processing: # 1. DC offset removal (subtract mean) — removes any mic bias # 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise, # and speaker-vibration resonance. Whisper ignores the # rumble band anyway, but it inflates RMS estimation and # steals dynamic range from the speech band. # 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost # that sharpens consonants (/t/, /s/, /k/ plosives/fricatives) # which Whisper's mel features care most about. # 4. Peak-normalize to 0.7. audio_f32 = audio_i16.astype(np.float32) / 32768.0 # 1. DC removal audio_f32 = audio_f32 - np.mean(audio_f32) # 2. High-pass at 80 Hz (1-pole IIR, stable + cheap) audio_f32 = self._highpass_80hz(audio_f32) # 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1] audio_f32 = np.append( audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1] ) # 4. Peak-normalize peak = float(np.abs(audio_f32).max()) if peak > 1e-4 and peak < 0.7: boost = 0.7 / peak audio_f32 = audio_f32 * boost log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak) # Initial prompt biases the model toward our command vocabulary. # Whisper uses this as decoder context — words in the prompt become # more likely, which converts ambiguous low-SNR audio like "muv rahh" # from a plausible English phrase ("and provide") into the intended # command ("move right"). Keep short — long prompts can be echoed. init_prompt = self._stt.get( "whisper_initial_prompt", "turn left, turn right, move forward, walk back, stop, come here, " "sit down, stand up, raise arm, wave, look around, what do you see, " "remember this, go home, patrol." ) beam_size = int(self._stt.get("whisper_beam_size", 5)) no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6)) log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0)) compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4)) # Temperature fallback: greedy first (T=0), then 0.2, then 0.4. # Whisper retries automatically when a pass is rejected by # its confidence gates (log_prob < threshold etc.). On noisy # audio this commonly rescues a bad greedy decode. temperatures = self._stt.get( "whisper_temperature_fallback", [0.0, 0.2, 0.4] ) try: segments, info = fw.transcribe( audio_f32, language="en", beam_size=beam_size, # 5 = much better than greedy on noisy audio temperature=temperatures, # greedy → 0.2 → 0.4 fallback initial_prompt=init_prompt, # command-vocabulary bias (empty by default) condition_on_previous_text=False, vad_filter=False, # we already trimmed silence without_timestamps=True, # Whisper's built-in gates — drop transcripts that look # like hallucinations (very low prob, highly compressed). no_speech_threshold=no_speech_threshold, log_prob_threshold=log_prob_threshold, compression_ratio_threshold=compression_ratio_t, ) # Collect segments and their mean log-prob for a confidence signal. seg_list = list(segments) text = " ".join(s.text for s in seg_list).strip() nsp = float(getattr(info, "no_speech_prob", 0.0)) if seg_list: mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list) log.info("whisper: lp=%.2f nsp=%.2f text=%r", mean_lp, nsp, text[:80]) else: # CRITICAL: log even when Whisper returned zero segments # so we can see WHY it dropped everything. Usually nsp is # above the threshold or the log-prob fallback killed it. log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop", nsp, no_speech_threshold, log_prob_threshold) except Exception as e: log.error("faster-whisper transcribe failed: %s", e) return "" if not text: return "" # Reject Whisper garbage patterns (stt.garbage_patterns) and # transcriptions shorter than stt.min_transcription_length. # Preserve: # - bare wake words (valid "just Sanad" signal → ack) # - exact matches in stt.command_vocab (legitimate short # commands like "go", "hi" must survive the length filter) low = text.lower().strip().rstrip(".!?,") vocab_exact = {c.lower() for c in COMMAND_VOCAB} if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH: if low not in WAKE_WORDS and low not in vocab_exact: log.info("Rejecting likely noise transcription: %r", text) return "" # NOTE: fuzzy-match to canonical command phrase used to happen # here, but it runs BEFORE gated-mode could see the wake word. # Moved to _normalize_command() and called at dispatch time # AFTER the wake-word gate + wake-word strip, so the gate # always sees the raw Whisper text. return text @staticmethod def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray: """ 1-pole IIR high-pass at ~80 Hz. Attenuates HVAC/fan rumble without touching the speech band. Cheap: 2 multiplies per sample. """ if x.size < 2: return x # Alpha from fc=80Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc) import math rc = 1.0 / (2 * math.pi * 80.0) dt = 1.0 / sr alpha = rc / (rc + dt) y = np.empty_like(x) y[0] = x[0] # vectorised enough — the loop is JITted by numpy internally # for reasonable sizes (~25k samples). prev_y, prev_x = x[0], x[0] for i in range(1, x.size): cur = alpha * (prev_y + x[i] - prev_x) y[i] = cur prev_y, prev_x = cur, x[i] return y def _transcribe_raw(self, audio_i16: np.ndarray) -> str: """ Like _transcribe but WITHOUT the garbage-pattern / length filters and without the `initial_prompt` bias. Used for wake verify, where: - We only care about the first phoneme (s/sh/z) — a 2-char "so" is a valid /sa-/ signature and MUST NOT be dropped by min_transcription_length. - A biased initial_prompt makes Whisper echo itself on unclear audio ("This is a robot assistant" → not s-starting → reject). The downside (no Sanad nudge) is fine here because the acoustic detector has already gated out non-speech. """ fw = self._get_fw() if fw is None: return "" if self._mic_gain != 1.0: audio_i16 = np.clip( audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767 ).astype(np.int16) audio_f32 = audio_i16.astype(np.float32) / 32768.0 peak = float(np.abs(audio_f32).max()) if peak > 1e-4 and peak < 0.7: audio_f32 = audio_f32 * (0.7 / peak) try: segments, info = fw.transcribe( audio_f32, language="en", beam_size=int(self._stt.get("whisper_beam_size", 5)), temperature=0.0, initial_prompt="", # NO bias → NO prompt echo condition_on_previous_text=False, vad_filter=False, without_timestamps=True, # Looser gates — we're about to do phonetic match, # not trust the transcription verbatim. no_speech_threshold=0.85, log_prob_threshold=-1.8, compression_ratio_threshold=3.0, ) seg_list = list(segments) text = " ".join(s.text for s in seg_list).strip() if seg_list: mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list) log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r", mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80]) return text except Exception as e: log.error("whisper-raw transcribe failed: %s", e) return "" # ─── command transcription ──────────────────────────── def _transcribe_command(self, audio_i16: np.ndarray) -> str: """ Decode the recorded command audio with faster-whisper. Thin wrapper over self._transcribe(); exists so _handle_wake and the always-on loop share one entry point. """ if audio_i16.size == 0: return "" return self._transcribe(audio_i16) def _save_turn_wav( self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd", ) -> Optional[str]: """ Save a single-turn command recording for debugging. Filename: {tag}_{epoch}_{sanitised_transcription}.wav Examples: cmd_1728562000_turn_right.wav ← successful command cmd_1728562030_hi.wav ← Whisper misheard as 'Hi' unk_1728562045_.wav ← Whisper returned empty cmd_1728562060_thanks_for_watch.wav ← garbage-filtered Rotation: keeps the most recent 50 across all tags so the disk doesn't fill up during a long session. Tunable via stt.recording_keep_count. """ try: import re as _re import wave rec_dir = os.path.join( PROJECT_ROOT, self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), ) os.makedirs(rec_dir, exist_ok=True) # Rotate — keep only the most recent N across all command WAVs. keep = int(self._stt.get("recording_keep_count", 50)) existing = sorted( f for f in os.listdir(rec_dir) if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav") ) for old in existing[:max(0, len(existing) - keep + 1)]: try: os.remove(os.path.join(rec_dir, old)) except Exception: pass # Sanitise transcription for filename: lowercase, alnum + _, <=40 chars slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40] path = os.path.join( rec_dir, f"{tag}_{int(time.time())}_{slug}.wav" ) with wave.open(path, "wb") as w: w.setnchannels(1) w.setsampwidth(2) w.setframerate(self._sample_rate) w.writeframes(audio_i16.astype(np.int16).tobytes()) return path except Exception as e: log.warning("failed to save turn wav: %s", e) return None def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]: """Backward-compat wrapper — save with the `unk` tag.""" return self._save_turn_wav(audio_i16, transcription="", tag="unk") # ─── command normalization (post-gate) ──────────────── def _normalize_command(self, text: str) -> str: """ Apply fuzzy-match to the closest canonical command phrase. Call AFTER the gated wake check so the wake word has already been stripped by the caller if appropriate. Turns near-misses like "Turn right up" → "turn right" so command_parser.py's regex fast-path can hit them without an LLM round-trip. """ canonical = _closest_command(text, cutoff=self._vocab_cutoff) if canonical != text: log.info("fuzzy-match: %r → %r", text, canonical) return canonical # ─── main loop ──────────────────────────────────────── def _voice_loop(self): """ Dispatch to the right loop based on stt.mode: "wake_and_command" — require "Sanad" wake word (acoustic), then record and transcribe a command. "always_on" — Transcribe every utterance, log all, and dispatch all to the brain. No wake. "always_on_gated" — Transcribe every utterance and log all, but ONLY dispatch utterances that contain "Sanad" (fuzzy). Wake word is stripped before the command is sent to the brain. """ mode = self._stt.get("mode", "wake_and_command").lower() self._mic_capture.start() if mode in ("always_on", "always_on_gated"): self._voice_loop_always_on(gated=(mode == "always_on_gated")) else: self._voice_loop_wake() def _voice_loop_wake(self): """Classic wake-and-command: listen for 'Sanad', then record command.""" log.info("Voice loop started — listening for wake (energy-based)") was_speaking = False while self._running: try: if self._audio.is_speaking: was_speaking = True time.sleep(0.1) self._detector.reset() continue if was_speaking: time.sleep(0.25) self._mic_capture.flush() self._detector.reset() was_speaking = False if time.time() < self._cooldown_until: _ = self._read_mic_raw(1024) self._detector.reset() time.sleep(0.05) continue chunk = self._read_mic_raw(1024) if not chunk: continue if self._detector.process(chunk): self._handle_wake() except Exception as e: log.error("Voice loop error: %s", e, exc_info=True) time.sleep(1) def _voice_loop_always_on(self, gated: bool = False): """ Always-on mode — Sanad-style continuous listening. If `gated` is True, utterances that don't contain the wake word "Sanad" (or a fuzzy variant) are logged but NOT dispatched to the brain — the robot hears everything, speaks only when addressed. Architecture (no wake word, no ack TTS): 1. Continuously read the gained mic stream in 32 ms chunks. 2. Run a hysteretic VAD on the stream — speech_entry_rms starts an utterance, silence_exit_rms + silence_duration ends one. 3. On each utterance end → Whisper transcribe → fuzzy-match → dispatch to brain. 4. Every ~5 s of idle: log a `ambient: rms=... peak=...` line so you can SEE what the mic is doing at all times, even when nobody's talking. Matches Sanad's "always listening" visibility. 5. Speech is not gated on amplitude — everything above the entry threshold is captured, quiet or loud. Loud speech clips naturally against int16; Whisper handles it. Thresholds come from the same stt.* config as wake mode but are typically tuned lower here (you want eager capture since there's no wake-word gate to prevent false positives). """ log.info( "Voice loop started — ALWAYS-ON mode%s", " [gated: only 'Sanad' utterances dispatched]" if gated else " (no wake word — every utterance dispatched)" ) speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0)) silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0)) silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8)) min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3)) max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0)) idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0)) ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4)) ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100)) buffer = bytearray() in_speech = False silence_budget = 0.0 speech_duration = 0.0 peak_rms = 0.0 idle_peak_rms = 0.0 idle_sum_rms = 0.0 idle_chunks = 0 last_idle_log = time.time() was_speaking_tts = False # Rolling ambient (idle-only) RMS buffer. Used to adapt silence_exit # so a noisy room doesn't trap the VAD at max_utter_s: if the # observed idle floor sits at rms=200, silence_exit needs to be # above 200 or silence never accumulates. We take # effective_exit = max(config_silence_exit, ambient_floor * mult). ambient_buf: list = [] ambient_floor = 0.0 # Seed ambient_floor by sampling ~1s of mic BEFORE entering the # loop. Without this, the very first utterance runs with # ambient_floor=0 → eff_exit=config_floor, which under-cuts # noisy rooms and creates self-sustaining echo loops. seed_chunks = [] seed_deadline = time.time() + 1.0 while time.time() < seed_deadline: r = self._read_mic_gained(1024) if r: a = np.frombuffer(r, dtype=np.int16) if a.size: seed_chunks.append( float(np.sqrt(np.mean(a.astype(np.float64) ** 2))) ) else: time.sleep(0.005) if seed_chunks: # Use the median so one loud transient doesn't poison the seed. seed_chunks.sort() ambient_floor = seed_chunks[len(seed_chunks) // 2] ambient_buf = list(seed_chunks[-ambient_win:]) log.info("ambient seeded: floor=%.0f from %d chunks", ambient_floor, len(seed_chunks)) while self._running: try: # Drop mic input while the robot itself is speaking so we # don't feed our own TTS back through Whisper. if self._audio.is_speaking: was_speaking_tts = True buffer.clear() in_speech = False silence_budget = 0.0 speech_duration = 0.0 peak_rms = 0.0 time.sleep(0.1) continue if was_speaking_tts: time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3))) self._mic_capture.flush() was_speaking_tts = False raw = self._read_mic_gained(1024) if not raw: time.sleep(0.005) continue arr = np.frombuffer(raw, dtype=np.int16) rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2))) chunk_s = (len(raw) // 2) / self._sample_rate if in_speech: buffer.extend(raw) speech_duration += chunk_s peak_rms = max(peak_rms, rms) # Adaptive silence exit: sits max(config_floor, # ambient_floor × mult). Prevents the "room is noisier # than silence_exit" failure mode where silence never # accumulates and every utterance hits max_utter_s. eff_exit = max(silence_exit, ambient_floor * ambient_mult) if rms < eff_exit: silence_budget += chunk_s else: silence_budget = 0.0 utter_over = (silence_budget >= silence_dur and speech_duration >= min_utter_s) force_stop = speech_duration >= max_utter_s if utter_over or force_stop: reason = "max-duration" if force_stop else "silence" audio = np.frombuffer(bytes(buffer), dtype=np.int16) log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d", reason, speech_duration, peak_rms, audio.size) # RESET STATE IMMEDIATELY — before any Whisper / # speak() / dispatch. Previously a `continue` from # the wake-only ack branch skipped the reset, and # the 12-second buffer lived forever, re-transcribed # every iteration into the same "Sanad" output, # spawning a self-sustaining "Yes" loop. buffer.clear() in_speech = False silence_budget = 0.0 speech_duration = 0.0 peak_rms = 0.0 text = self._transcribe_command(audio) if audio.size else "" if text: log.info("HEARD: %r", text) # Gated mode: only dispatch if the wake word was # spoken. Everything is still logged above so the # operator has full visibility into what the mic # is picking up. if gated and not _has_wake_word(text): log.info(" (no wake word — not dispatched)") else: if gated: command = _strip_wake_word(text) if command != text: log.info(" wake-stripped: %r → %r", text, command) # Bare wake word ("Sanad.", "Sanad") → # speak a "Yes" ack, do NOT call the # brain (it would hallucinate a random # response from a 1-word prompt). if not command: log.info(" wake-only utterance — speaking ack") try: self._audio.speak( self._messages.get("wake_heard", "Yes") ) except Exception as e: log.warning("wake-ack TTS failed: %s", e) continue else: command = text # Normalize near-misses ("Turn right up" → # "turn right") so command_parser's regex # fast-path can hit without an LLM round-trip. command = self._normalize_command(command) print(f' [Sanad] heard: "{command}"') if self._on_command: try: self._on_command(command, "en") except Exception as e: log.error("on_command: %s", e, exc_info=True) else: log.info("utterance rejected (empty/garbage after Whisper)") else: idle_peak_rms = max(idle_peak_rms, rms) idle_sum_rms += rms idle_chunks += 1 # Maintain the rolling ambient floor so silence_exit can # adapt. Use windows that are *clearly* not speech # (rms < speech_entry / 2) — otherwise a borderline # window just before transition pollutes the floor. if rms < speech_entry * 0.5: ambient_buf.append(rms) if len(ambient_buf) > ambient_win: ambient_buf.pop(0) if ambient_buf: ambient_floor = sum(ambient_buf) / len(ambient_buf) if rms >= speech_entry: # utterance starts — keep this chunk as pre-roll log.info("utterance start (rms=%.0f >= entry=%.0f)", rms, speech_entry) buffer.extend(raw) in_speech = True speech_duration = chunk_s peak_rms = rms silence_budget = 0.0 # periodic ambient log while idle — "I am listening" now = time.time() if (now - last_idle_log) >= idle_log_s and idle_chunks > 0: eff_exit = max(silence_exit, ambient_floor * ambient_mult) log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d " "floor=%.0f entry=%.0f eff_exit=%.0f", idle_sum_rms / idle_chunks, idle_peak_rms, idle_chunks, ambient_floor, speech_entry, eff_exit) idle_peak_rms = 0.0 idle_sum_rms = 0.0 idle_chunks = 0 last_idle_log = now except Exception as e: log.error("Always-on voice loop error: %s", e, exc_info=True) time.sleep(1) def _handle_wake(self): t_wake = time.time() log.info("Wake detected (acoustic)") # Verify the burst that triggered wake actually sounds like a # wake word. The acoustic detector fires on ANY 0.2-1.5s burst # (coughs, claps, door slams). We run a lightweight Whisper # decode on the burst and accept if EITHER: # (a) a wake-word variant is in the transcription, OR # (b) the transcription starts with 's'/'sh'/'z' — Whisper's # consistent signature for mishearing non-English "Sanad" # as an English /sa-/ word ("Stop", "Set", "Sand", "Send"). # Reject if Whisper returns empty (pure noise / cough) or # confidently not-s speech ("hello", "okay"). if self._stt.get("wake_verify_enabled", True): burst = self._detector.get_last_burst() if burst is not None and burst.size >= int(0.15 * self._sample_rate): t_verify = time.time() # Lenient transcribe — no garbage filter, no min-length, # no bias prompt. See _transcribe_raw docstring. heard = self._transcribe_raw(burst) verify_ms = (time.time() - t_verify) * 1000 low = (heard or "").lower().strip().lstrip('"\'.,!?') if not low: log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms) return starts_with_s = low.startswith(("s", "sh", "z")) if _has_wake_word(heard): log.info(" wake verified (wake-word: %r, %.0fms)", heard, verify_ms) elif starts_with_s: log.info(" wake verified (s-phonetic: %r, %.0fms)", heard, verify_ms) else: log.info(" wake REJECTED — %r (%.0fms, not s-starting)", heard, verify_ms) return print("\n [Sanad] wake heard — listening…") ack_mode = self._stt.get("wake_ack", "tts").lower() if ack_mode == "none": log.info(" wake-ack: silent (no TTS)") else: try: self._audio.speak(self._messages.get("wake_heard", "Yes")) except Exception as e: log.warning("TTS ack failed: %s", e) # Wait for ack TTS + speaker reverberation to decay while self._audio.is_speaking: time.sleep(0.05) settle = float(self._stt.get("post_tts_settle_sec", 0.3)) time.sleep(settle) self._mic_capture.flush() log.info(" wake→record-ready: %.2fs", time.time() - t_wake) log.info("Recording command...") audio = self._record_command() # _record_command returns empty if it never saw speech above the # adaptive entry threshold — no point running STT on noise. # Two cases: # audio.size == 0 → no speech at all (likely false wake # from cough/slam). SILENTLY reset — # don't blurt "I didn't catch that" on # what was never a real interaction. # 0 < size < 8000 → brief speech burst (< 0.5s). Probably # a real-but-unintelligible attempt; # speak "I didn't catch that" so the # user knows to retry. if audio.size == 0: log.info("Command dropped (no speech — likely false wake); silent reset") self._cooldown_until = time.time() + float( self._stt.get("command_cooldown_sec", 1.5)) return if audio.size < 8000: # < 0.5 s but > 0 — real short attempt log.info("Command too short (%.2fs); asking user to repeat", audio.size / self._sample_rate) try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) except Exception: pass self._cooldown_until = time.time() + float( self._stt.get("command_cooldown_sec", 1.5)) return peak = int(np.abs(audio).max()) rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) log.info("command audio: samples=%d peak=%d rms=%.1f", audio.size, peak, rms) text = self._transcribe_command(audio) if not text: log.info("Empty or rejected transcription") # Save WAV of the failed transcription for post-mortem. if self._stt.get("recording_enabled", True): self._save_turn_wav(audio, transcription="", tag="unk") try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) except Exception: pass self._cooldown_until = time.time() + float( self._stt.get("command_cooldown_sec", 1.5)) return # Normalize near-miss transcriptions like "Turn right up" → "turn # right" so the brain's regex fast-path catches them. text = self._normalize_command(text) log.info("Transcribed: %s", text[:120]) # Save every successful command recording so you can listen back # later and see what the mic actually heard vs what Whisper # transcribed. Disable with stt.recording_enabled=false. if self._stt.get("recording_enabled", True): wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd") if wav_path: log.info("saved: %s", os.path.basename(wav_path)) if self._on_command: try: self._on_command(text, "en") except Exception as e: log.error("on_command error: %s", e, exc_info=True) elif self._on_wake: try: self._on_wake() except Exception: pass cd = float(self._stt.get("command_cooldown_sec", 1.5)) self._cooldown_until = time.time() + cd log.info("wake→dispatch total: %.2fs | cooldown %.1fs", time.time() - t_wake, cd) # ─── start / stop ───────────────────────────────────── def start(self): if self._running: log.warning("VoiceModule already running") return self._running = True self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice") self._thread.start() log.info("Voice module started") def stop(self): self._running = False try: self._mic_capture.stop() except Exception: pass if self._thread: self._thread.join(timeout=5) self._thread = None log.info("Voice module stopped") @property def is_running(self) -> bool: return self._running if __name__ == "__main__": from API.audio_api import AudioAPI def on_cmd(text, lang): print(f"\n COMMAND [{lang}]: {text}\n") audio = AudioAPI() voice = VoiceModule(audio, on_command=on_cmd) print('Starting. Say "Sanad", then speak your command.\n') voice.start() try: while voice.is_running: time.sleep(0.5) except KeyboardInterrupt: voice.stop()