diff --git a/API/audio_api.py b/API/audio_api.py index 89169df..8b73f1a 100644 --- a/API/audio_api.py +++ b/API/audio_api.py @@ -211,56 +211,58 @@ class AudioAPI: # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ───────── - def _play_pcm(self, audio_16k: np.ndarray) -> float: - """Play 16kHz mono int16 on G1 speaker. Returns duration.""" + def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float: + """ + Play mono int16 PCM on the G1 speaker. + + `rate` is the sample rate of the incoming `audio`; we always + resample to self._target_rate (16 kHz) before sending because the + G1 speaker hardware only honors that rate — if you hand it 24 kHz + PCM, it plays ~1.5x too fast. This matches the Sanad pattern. + + Uses AudioClient.PlayStream (the high-level API) with a fresh + stream_id + STOP_PLAY bracket on either side so a prior stream + can't blend into this one. + """ if not self._sdk_available: log.warning("SDK not available, cannot play audio") return 0.0 - from unitree_sdk2py.g1.audio.g1_audio_api import ( - ROBOT_API_ID_AUDIO_START_PLAY, - ROBOT_API_ID_AUDIO_STOP_PLAY, - ) + src_rate = int(rate) if rate else self._target_rate + audio = self._resample(audio, src_rate) # → self._target_rate + if audio.size == 0: + return 0.0 + + from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY app_name = self._spk["app_name"] - # Stop previous stream + # Stop any prior stream before opening a new one. self._client._Call( ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}), ) - time.sleep(0.3) + time.sleep(0.15) - # Build params — unique stream_id every call - pcm = audio_16k.tobytes() sid = f"s_{int(time.time() * 1000)}" - param = json.dumps({ - "app_name": app_name, - "stream_id": sid, - "sample_rate": self._target_rate, - "channels": 1, - "bits_per_sample": 16, - }) + self._client.PlayStream(app_name, sid, audio.tobytes()) - # Single call — full buffer - self._client._CallRequestWithParamAndBin( - ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm) - ) - - duration = len(audio_16k) / self._target_rate - time.sleep(duration + 0.5) + duration = len(audio) / self._target_rate + # Margin covers DDS buffer drain before STOP cuts playback short. + time.sleep(duration + 0.3) self._client._Call( ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}), ) - log.info("Played: %.1fs", duration) + log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)", + duration, src_rate, self._target_rate) return duration - def play_pcm(self, audio_16k: np.ndarray) -> float: + def play_pcm(self, audio: np.ndarray, rate: int = None) -> float: """Public wrapper for playing PCM audio.""" - return self._play_pcm(audio_16k) + return self._play_pcm(audio, rate=rate) # ─── MIC RECORDING ─────────────────────────────────── diff --git a/API/zmq_api.py b/API/zmq_api.py index adeb91b..1b4763f 100644 --- a/API/zmq_api.py +++ b/API/zmq_api.py @@ -80,6 +80,11 @@ def send_cmd(cmd: str): _ensure_sock().send_string(json.dumps({"cmd": cmd})) -# Load MOVE_MAP from navigation config (pure data, safe at import time) +# Load navigation constants from config (pure data, safe at import time). +# MOVE_MAP[direction] = (vx, vy, vyaw). STEP_DURATION_SEC is how long one +# "step" of a bare directional command lasts (2 s at default velocities +# ≈ 60 cm forward or 34° turn). Both live in config_Navigation.json so +# motion can be retuned without editing Python. 
_nav = load_config("Navigation") -MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} +MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} +STEP_DURATION_SEC = float(_nav.get("step_duration_sec", 2.0)) diff --git a/Brain/command_parser.py b/Brain/command_parser.py index a166e28..c25a462 100644 --- a/Brain/command_parser.py +++ b/Brain/command_parser.py @@ -4,7 +4,7 @@ Handles place memory, odometry, session recall, help, examples """ import re import time -from API.zmq_api import send_vel, gradual_stop +from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC from API.memory_api import mem, place_save, place_goto, places_list_str from API.odometry_api import odom, ODOM_AVAILABLE from API.camera_api import get_frame @@ -36,6 +36,30 @@ _RE_WALK_STEP = re.compile( r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I) _RE_TURN_STEP = re.compile( r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I) + +# Simple one-shot motion — one word or verb+direction, no counts/units. +# All default to a ~2 s motion at the normal velocity. Kept local so the +# user doesn't eat a 5 s Qwen round-trip for a trivial "go back". +# +# Matches: +# "left" / "right" / "forward" / "back" / "backward" +# "go back" / "step back" / "move back" / "walk back" / "run back" +# "go forward" / "step forward" / "move forward" / "walk forward" +# "go left" / "move right" / "step left" / etc. +# "head forward" / "head back" +# Does NOT match multi-word phrases like "walk to the chair" — those +# still fall through to Qwen where they belong. +_RE_SIMPLE_DIR = re.compile( + r"^(?:(?:walk|go|move|step|run|head)\s+)?" + r"(forward|back(?:ward)?|left|right)$", + re.I, +) + +# Bare stop / pause words — no need to ask Qwen what "stop" means. +_RE_STOP_SIMPLE = re.compile( + r"^(?:stop|halt|wait|pause|stay|freeze|hold|stand\s+still|don'?t\s+move)$", + re.I, +) _RE_PATROL_RT = re.compile( r"^patrol[/:]\s*(.+)$", re.I) _RE_LAST_CMD = re.compile( @@ -115,9 +139,10 @@ def try_local_command(cmd: str) -> bool: if odom: odom.walk_distance(meters) else: + vx, _, _ = MOVE_MAP["forward"] t0 = time.time() - while time.time() - t0 < meters / 0.3: - send_vel(vx=0.3) + while time.time() - t0 < meters / abs(vx): + send_vel(vx=vx) time.sleep(0.05) gradual_stop() return True @@ -128,9 +153,10 @@ def try_local_command(cmd: str) -> bool: if odom: odom.walk_distance(meters, direction="backward") else: + vx, _, _ = MOVE_MAP["backward"] t0 = time.time() - while time.time() - t0 < meters / 0.2: - send_vel(vx=-0.2) + while time.time() - t0 < meters / abs(vx): + send_vel(vx=vx) time.sleep(0.05) gradual_stop() return True @@ -144,9 +170,13 @@ def try_local_command(cmd: str) -> bool: if odom: odom.turn_degrees(degrees) else: + # vyaw magnitude comes from MOVE_MAP["left"]; duration is + # abs(degrees)/(vyaw_deg_per_sec). vyaw in config is rad/s. 
+ _, _, vyaw_mag = MOVE_MAP["left"] + vyaw_deg_per_sec = abs(vyaw_mag) * 180.0 / 3.14159265 + vyaw = vyaw_mag if degrees > 0 else -vyaw_mag + duration = abs(degrees) / vyaw_deg_per_sec t0 = time.time() - vyaw = 0.3 if degrees > 0 else -0.3 - duration = abs(degrees) / 17.2 while time.time() - t0 < duration: send_vel(vyaw=vyaw) time.sleep(0.05) @@ -156,9 +186,11 @@ def try_local_command(cmd: str) -> bool: m = _RE_WALK_STEP.match(cmd) if m: direction = (m.group(1) or "forward").lower() + if direction.startswith("back"): + direction = "backward" steps = int(m.group(2)) - vx = -0.2 if direction.startswith("back") else 0.3 - duration = 2.0 * steps + vx, _, _ = MOVE_MAP[direction] + duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: send_vel(vx=vx) @@ -170,8 +202,8 @@ def try_local_command(cmd: str) -> bool: if m: direction = m.group(1).lower() steps = int(m.group(2)) if m.group(2) else 1 - vyaw = 0.3 if direction == "left" else -0.3 - duration = 2.0 * steps + _, _, vyaw = MOVE_MAP[direction] + duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: send_vel(vyaw=vyaw) @@ -179,6 +211,31 @@ def try_local_command(cmd: str) -> bool: gradual_stop() return True + # ── BARE / SIMPLE DIRECTIONAL COMMANDS ─────────────────────────────── + # "left", "right", "forward", "back", "go back", "move forward", + # "step right", "walk back", "head forward" — any one-word direction + # or verb+direction with no explicit count. Duration and velocities + # come entirely from config_Navigation.json (MOVE_MAP + + # step_duration_sec) — no magic numbers here. + m = _RE_SIMPLE_DIR.match(cmd) + if m: + direction = m.group(1).lower() + if direction.startswith("back"): + direction = "backward" + vx, vy, vyaw = MOVE_MAP[direction] + t0 = time.time() + while time.time() - t0 < STEP_DURATION_SEC: + send_vel(vx=vx, vy=vy, vyaw=vyaw) + time.sleep(0.05) + gradual_stop() + return True + + # ── BARE STOP / PAUSE ───────────────────────────────────────────────── + m = _RE_STOP_SIMPLE.match(cmd) + if m: + gradual_stop() + return True + # ── NAMED PATROL ROUTE ─────────────────────────────────────────────── m = _RE_PATROL_RT.match(cmd) if m: @@ -234,17 +291,11 @@ def try_local_command(cmd: str) -> bool: for phrase, (reverse_dir, _) in move_words.items(): if phrase in cl: print(f" Undoing: '{c}' → reversing with '{reverse_dir}'") - dur, t0 = 2.0, time.time() - if reverse_dir in ("left", "right"): - vyaw = 0.3 if reverse_dir == "left" else -0.3 - while time.time() - t0 < dur: - send_vel(vyaw=vyaw) - time.sleep(0.05) - else: - vx = 0.3 if reverse_dir == "forward" else -0.2 - while time.time() - t0 < dur: - send_vel(vx=vx) - time.sleep(0.05) + vx, vy, vyaw = MOVE_MAP[reverse_dir] + t0 = time.time() + while time.time() - t0 < STEP_DURATION_SEC: + send_vel(vx=vx, vy=vy, vyaw=vyaw) + time.sleep(0.05) gradual_stop() return True print(" No movement command to undo") diff --git a/Config/config_Navigation.json b/Config/config_Navigation.json index f5f6f7c..bb166eb 100644 --- a/Config/config_Navigation.json +++ b/Config/config_Navigation.json @@ -3,6 +3,8 @@ "min_steps_before_check": 3, "scan_interval_s": 0.4, "rotation_speed": 0.3, + "_step_duration_comment": "Duration of one 'step' for bare directional commands ('go back', 'turn right', etc.). 2.0s at move_map velocities ≈ 60 cm forward, 40 cm back, 34° turn. 
Change here and every regex fast-path in command_parser.py uses the new value.", + "step_duration_sec": 2.0, "move_map": { "forward": [0.3, 0.0, 0.0], "backward": [-0.2, 0.0, 0.0], diff --git a/Config/config_Voice.json b/Config/config_Voice.json index d3fa55e..0332956 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -5,14 +5,119 @@ "target_sample_rate": 16000 }, "stt": { - "backend": "custom_acoustic", - "_comment": "Pure-DSP wake detector in Voice/wake_detector.py. No ML.", - "speech_threshold": 150.0, - "min_word_duration": 0.20, - "max_word_duration": 1.50, - "post_silence": 0.30, - "wake_cooldown": 1.50, - "wake_chunk_ms": 50 + "backend": "faster_whisper", + "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", + + "_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).", + "mode": "wake_and_command", + + "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", + "always_on_speech_entry_rms": 150.0, + "always_on_silence_exit_rms": 70.0, + "always_on_silence_duration_sec": 0.8, + "always_on_min_utterance_sec": 0.3, + "always_on_max_utterance_sec": 12.0, + "always_on_idle_log_sec": 5.0, + "always_on_ambient_mult": 1.4, + "always_on_ambient_window_chunks": 100, + + + "whisper_model": "base.en", + "whisper_device": "cpu", + "whisper_compute_type": "int8", + + "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.", + "mic_gain": 1.0, + "whisper_beam_size": 8, + "whisper_no_speech_threshold": 0.85, + "whisper_log_prob_threshold": -1.8, + "whisper_compression_ratio_threshold": 3.0, + "whisper_temperature_fallback": [0.0, 0.2, 0.4], + "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", + "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. 
A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", + "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", + "whisper_initial_prompt": "Robot voice command.", + + "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", + "wake_words": [ + "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad", + "sanat", "sunnat", "sonnat", "sinnat", "sennat", + "sanid", "sanud", "saned", "sanod", "sanaad", + "senad", "sinad", "sonad", "sunad", + "sanah", "sanath", "sanadh", "sonadh", + "samad", "somad", "sumad", + "thanad", "zanad", + "sa nad", "san ad", "san odd", "san add" + ], + "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.", + "command_vocab": [ + "what do you see", "what can you see", "look around", + "come to me", "come here", "come back", "come closer", + "approach", "get closer", "come", + "go home", "go back", "go forward", "go backward", + "go left", "go right", "go", + "sit down", "stand up", "sit", "stand", + "raise arm", "lower arm", "wave hello", "wave", "point", + "turn left", "turn right", "turn around", + "move forward", "move backward", "move back", + "move left", "move right", + "walk forward", "walk backward", "walk back", + "step forward", "step back", "step left", "step right", + "forward", "backward", "back", "left", "right", + "patrol", "stop", "halt", "wait", "pause", "freeze", "hold", + "hello", "hi", "hey", "help", + "who are you", "where are you", "where am i", "what is your name", + "remember this", "forget", "do it again", "repeat", "undo", + "follow me", "stay here" + ], + "command_vocab_cutoff": 0.72, + "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.", + "garbage_patterns": [ + "thanks for watching", "thank you for watching", + "thank you", "thanks", + "bye", "goodbye", + ".", "you", "yeah", + "okay", "ok", + "um", "uh", "hmm", "mm", + "i", "a" + ], + "min_transcription_length": 3, + + + "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. 
Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.", + "speech_threshold": 400.0, + "min_word_duration": 0.25, + "max_word_duration": 2.50, + "post_silence": 0.20, + "wake_cooldown": 1.00, + "wake_chunk_ms": 50, + "wake_adaptive_window_n": 50, + "wake_adaptive_mult": 3.0, + "wake_diag_log_sec": 3.0, + + "wake_ack": "tts", + "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", + + "_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.", + "wake_verify_enabled": false, + + + "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.", + "speech_entry_rms": 400.0, + "silence_exit_rms": 200.0, + "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.", + "silence_duration_sec": 0.6, + "max_record_sec": 5.0, + "min_record_sec": 0.4, + "ambient_probe_sec": 0.2, + "ambient_mult": 1.5, + "ambient_cap_rms": 200.0, + "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.", + "recording_enabled": true, + "recording_keep_count": 50, + "command_cooldown_sec": 1.5, + "post_tts_settle_sec": 0.4, + "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'." }, "mic": { "backend": "builtin_udp", diff --git a/Config/marcus_prompts.yaml b/Config/marcus_prompts.yaml index 7a71fbf..0ff9b4f 100644 --- a/Config/marcus_prompts.yaml +++ b/Config/marcus_prompts.yaml @@ -25,6 +25,15 @@ main_prompt: | - speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English. - abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe. 
+ - CRITICAL — IF THE COMMAND IS UNCLEAR OR NOT AN ACTION: + If the input text is a single unclear word (like "I", "alright", "ok", "um"), + a random phrase ("I have a lot of beauty", "turn turn turn"), noise, a + greeting ("hello", "hi"), or anything that isn't clearly a movement / + arm / vision / memory instruction — DO NOT INVENT a command. Instead + reply with: + {{"actions":[],"arm":null,"speak":"Sorry, I didn't understand that — please repeat","abort":"command not understood"}} + Better to ask again than to guess and perform the wrong action. + Examples (learn the STRUCTURE, don't reuse the speak text): "turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}} "walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}} diff --git a/Doc/MARCUS_API.md b/Doc/MARCUS_API.md index bfbb718..9f400db 100644 --- a/Doc/MARCUS_API.md +++ b/Doc/MARCUS_API.md @@ -29,7 +29,7 @@ | Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. | | Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. | | Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. | -| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. | +| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. | | Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. | | Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. | | Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. | @@ -766,9 +766,9 @@ SAFETY: --- -## 15. Voice API (mic + TTS + STT) +## 15. Voice API (mic + TTS + wake + STT) -New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack. +Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper). 
### Mic — `Voice.builtin_mic.BuiltinMic` @@ -800,9 +800,36 @@ tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly. -### Wake + command loop — `Voice.marcus_voice.VoiceModule` +### Wake detection — `Voice.wake_detector.WakeDetector` -Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands. +Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification. + +```python +from Voice.wake_detector import WakeDetector, WakeConfig +cfg = WakeConfig( + sample_rate=16_000, + speech_threshold=400.0, # min RMS floor — above noise + min_word_duration_s=0.35, # filter out coughs (<0.35s) + max_word_duration_s=1.50, # filter out sentences + post_silence_s=0.30, # how long silence marks word end + cooldown_s=1.50, # min gap between fires + chunk_ms=50, # RMS analysis window + adaptive_window_n=50, # rolling mean of idle RMS + adaptive_mult=3.0, # effective = max(floor, baseline×mult) +) +det = WakeDetector(cfg) +while True: + pcm = mic.read_chunk(1024) + if det.process(pcm): + burst = det.get_last_burst() # audio that triggered wake + break +``` + +Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`. + +### Voice orchestrator — `Voice.marcus_voice.VoiceModule` + +Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`. ```python from API.audio_api import AudioAPI @@ -818,7 +845,9 @@ voice.start() # background thread voice.stop() ``` -Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`. +Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet. + +The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `audio_api.speak(reply)`. ### AudioAPI — `API.audio_api.AudioAPI` diff --git a/Doc/architecture.md b/Doc/architecture.md index dd12b3d..24b830e 100644 --- a/Doc/architecture.md +++ b/Doc/architecture.md @@ -15,7 +15,7 @@ - **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import. - **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic. - **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed. -- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`). +- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed. - **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages. 
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps. - **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows. @@ -66,7 +66,7 @@ Marcus/ │ ├── config_Memory.json # session/places paths │ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports │ ├── config_ImageSearch.json # search defaults -│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port +│ ├── config_Voice.json # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds │ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params │ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch) │ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.) @@ -83,10 +83,11 @@ Marcus/ │ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic │ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status() │ -├── Voice/ # Mic + TTS + wake-word STT +├── Voice/ # Mic + TTS + wake detector + faster-whisper STT │ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555 │ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id) -│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine +│ ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline +│ └── marcus_voice.py # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch │ ├── Brain/ # Decision logic — imports ONLY from API/ │ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal() @@ -188,7 +189,8 @@ Marcus/ │ goal_nav.py │ │ builtin_mic.py │ │ patrol.py │ │ builtin_tts.py │ │ marcus_odometry.py │ │ marcus_voice.py │ -│ marcus_yolo.py │ │ (Whisper + TtsMaker) │ +│ marcus_yolo.py │ │ wake_detector.py │ +│ │ │ (Whisper + TtsMaker) │ │ marcus_imgsearch.py │ └──────────┬──────────────┘ └──────────────┬───────────┘ │ │ │ @@ -487,10 +489,10 @@ Supports text-only search (no reference image) using hint description. ### Voice/ -Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable). +Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable). -#### `builtin_mic.py` (~180 lines, new 2026-04-21) -Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. +#### `builtin_mic.py` (~180 lines) +Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`. **Exports:** - `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent) @@ -499,20 +501,41 @@ Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. 
Joins the G1's on-boa - `read_seconds(s)` — convenience for "record `s` seconds" - `flush()` — drop buffered audio (called while TTS plays, to avoid echo) -#### `builtin_tts.py` (~70 lines, new 2026-04-21) -Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone). +#### `builtin_tts.py` (~70 lines) +Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input. **Exports:** - `BuiltinTTS(audio_client, default_speaker_id=0)` — init - `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker -#### `marcus_voice.py` (~340 lines, rewired 2026-04-21) -Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`. +#### `wake_detector.py` (~240 lines) +Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.2-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification. +Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns. **Exports:** -- `VoiceModule(audio_api, on_command=cb)` — init -- `start()` — spawn background thread -- `stop()` — graceful teardown +- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)` +- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word" +- `reset()`, `get_last_burst() -> np.ndarray | None` + +#### `marcus_voice.py` (~1000 lines) +Voice orchestrator. Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback. + +Three operating modes selectable via `stt.mode`: +- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain +- `always_on`: no wake, transcribe every utterance, dispatch all +- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad" + +Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently. 
+ +**Module-level** (populated at `VoiceModule.__init__` from config): +- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth +- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → "" +- `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB` + +**Exports:** +- `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init +- `start()` / `stop()` — background thread lifecycle +- `is_running` property --- diff --git a/Doc/controlling.md b/Doc/controlling.md index 20e0b16..a9492b4 100644 --- a/Doc/controlling.md +++ b/Doc/controlling.md @@ -79,16 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765` ## Voice -- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`) +- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants). - **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed. -- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally. +- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient. +- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently. +- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after. - **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only. -- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command. +- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening. -Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker. +Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker. + +Three voice modes selectable via `config_Voice.json::stt.mode`: +- `wake_and_command` (default) — wake word required before each command +- `always_on` — continuously transcribe + dispatch every utterance +- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad" To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster. 
+**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`: +- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration` +- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed +- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold` +- "I didn't catch that" on silence → raise `min_transcription_length` +- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle) + --- ## Command Reference @@ -230,7 +244,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: | `llama runner process has terminated: %!w()` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` | | Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only | | `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast | -| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` | +| Wake word never fires | Energy burst below floor, or Whisper verify rejecting | Check `logs/voice.log` — if you see `wake REJECTED — 'X'`, add X's root variant to `config_Voice.json::stt.wake_words`. If `baseline=0` persists, your ambient exceeds the floor — raise `speech_threshold`. | | Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" | | `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` | | Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up | @@ -243,7 +257,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: |------|------| | Brain code | `~/Marcus/Brain/` | | Server | `~/Marcus/Server/marcus_server.py` | -| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` | +| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,wake_detector,marcus_voice}.py` | | Config | `~/Marcus/Config/` | | Prompts | `~/Marcus/Config/marcus_prompts.yaml` | | YOLO model | `~/Marcus/Models/yolov8m.pt` | @@ -261,7 +275,7 @@ See `Doc/functions.md` for the full function inventory (AST-generated). ## Language policy **English only.** Arabic was removed from the codebase on 2026-04-21: -- `Config/config_Voice.json::stt.wake_words_en` — only English variants (`sanad`, `sannad`, `sanat`, `sunnat`) +- `Config/config_Voice.json::stt.wake_words` — English fuzzy variants only (33 entries), excludes common English words that would false-trigger (`said`, `sand`, `sunday`, etc.) 
- `Config/marcus_prompts.yaml` — no Arabic examples left in any of the 7 prompts - `API/audio_api.py::speak(text)` — rejects non-ASCII (the G1 TtsMaker silently maps Arabic to Chinese, which nobody wants) - `Brain/marcus_brain.py` — greeting and talk-pattern regexes match English only diff --git a/Doc/environment.md b/Doc/environment.md index 23c624a..94f3817 100644 --- a/Doc/environment.md +++ b/Doc/environment.md @@ -384,3 +384,5 @@ Config file (`Config/config_Vision.json`): | 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). | | 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. | | 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. | +| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake**. Added `Voice/wake_detector.py` (pure-numpy energy state machine, adaptive noise floor, burst-audio capture for verify). Rewrote `Voice/marcus_voice.py` around it: three operating modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, pre-speech silence trim (300 ms pre-roll preserved), faster-whisper `base.en` int8 CPU decode, fuzzy-match canonicalisation against `command_vocab`, `GARBAGE_PATTERNS` + length filter for noise hallucinations, `/s-/` phonetic wake verify (accepts Whisper mishearings of "Sanad" like "Stop"/"Set"/"Sand"). Tried and reverted: Gemini Live WebSocket (Python 3.8 incompatibility + latency), Vosk grammar STT (English lexicon can't decode "Sanad"; big model cold-load too slow on Jetson). All voice tunables (33 wake_words, 68 command_vocab, 17 garbage_patterns, ~25 threshold/VAD/Whisper keys) live in `config_Voice.json::stt.*` — zero hardcoded strings in Voice/. | +| 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. | diff --git a/Doc/functions.md b/Doc/functions.md index 3ada97d..1aaea92 100644 --- a/Doc/functions.md +++ b/Doc/functions.md @@ -49,13 +49,14 @@ Script only. 
Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra --- -## `Voice/` — mic + TTS + STT +## `Voice/` — mic + TTS + wake + STT | File | Public API | |---|---| | `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** | | `builtin_tts.py` | **class `BuiltinTTS`** | -| `marcus_voice.py` | **class `State`** (IDLE/WAKE_HEARD/PROCESSING/SPEAKING), **class `VoiceModule`** | +| `wake_detector.py` | **dataclass `WakeConfig`** + **class `WakeDetector`** | +| `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** | **`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic: `__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`. @@ -63,8 +64,11 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra **`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`: `__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`. -**`Voice.marcus_voice.VoiceModule`** — Whisper wake + command STT: -`__init__(audio_api, on_command)`, `start()`, `stop()`, props `state`, `is_running`. Internal state machine: `_do_idle`, `_do_wake_heard`, `_do_processing`; helpers `_load_whisper`, `_transcribe`, `_check_wake_word`, `_record_chunk`, `_record_until_silence`, `_voice_loop`. +**`Voice.wake_detector.WakeDetector`** — pure-numpy energy wake: +`__init__(cfg: WakeConfig)`, `process(pcm_bytes) -> bool`, `reset()`, `get_last_burst() -> np.ndarray | None`. Internal: `_step(window)` state-machine per 50 ms analysis window; adaptive `_baseline_buf` rolling mean of idle-silence RMS; captures triggering burst audio for post-hoc Whisper verify. + +**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Drives the wake detector, verifies each fire with a lightweight Whisper decode (wake-word substring OR /s-/ phonetic match), records commands with a hysteretic VAD, trims pre-speech silence, transcribes via faster-whisper, fuzzy-normalises near-misses to canonical commands, dispatches to brain. +`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `is_running` property. Internal: `_get_fw()` lazy faster-whisper loader, `_read_mic_raw` / `_read_mic_gained`, `_record_command()` with adaptive VAD + pre-silence trim, `_transcribe(audio)` Whisper decode + garbage filter, `_transcribe_command(audio)` thin wrapper, `_normalize_command(text)` fuzzy-match to `COMMAND_VOCAB`, `_handle_wake()` / `_voice_loop()` / `_voice_loop_wake()` / `_voice_loop_always_on(gated)`, `_save_unk_wav(audio)` for post-mortem debugging. 
--- diff --git a/Doc/pipeline.md b/Doc/pipeline.md index 2292911..a7d50a8 100644 --- a/Doc/pipeline.md +++ b/Doc/pipeline.md @@ -51,22 +51,43 @@ G1 body mic (array) Voice/builtin_mic.py::BuiltinMic ring buffer (64 KB) + read_chunk(n) ▼ -Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING) - ├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…) - ├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker - ├─ PROCESSING : record-until-silence → Whisper small → transcribed text - └─ on_command(text, "en") +Voice/wake_detector.py::WakeDetector + pure-numpy energy state machine (SILENCE ⇄ SPEAKING) + adaptive noise floor: eff_threshold = max(speech_threshold, baseline × 3) + fires on 0.35-1.5 s bursts followed by 0.3 s silence → captures burst audio + ▼ +Voice/marcus_voice.py::VoiceModule._handle_wake() + ├─ 1. Whisper verify on the burst audio: + │ text = faster-whisper(burst) + │ accept if _has_wake_word(text) OR startswith(s/sh/z) + │ reject otherwise (cough, clap, hello, okay) → silent return + ├─ 2. audio_api.speak("Yes") → G1 body speaker (~1.5 s) + ├─ 3. post_tts_settle_sec wait + mic flush + ├─ 4. _record_command() — hysteretic VAD + │ speech_entry_rms / silence_exit_rms (adapt from wake baseline) + │ trim leading silence (keep 300 ms pre-roll) → tight clip for Whisper + ├─ 5. _transcribe(audio) + │ faster-whisper (base.en int8 CPU) + │ beam_size=5, temperature=0, initial_prompt bias toward Sanad vocab + │ GARBAGE_PATTERNS + min_transcription_length reject noise hallucinations + ├─ 6. _normalize_command(text) + │ difflib fuzzy-match vs stt.command_vocab + │ "Turn right up" → "turn right" (canonical form) + └─ 7. on_command(text, "en") ▼ Brain/marcus_brain.py::process_command(text) ├─ regex fast-path → Brain/command_parser.py::try_local_command() - │ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off + │ places · odometry walk/turn · patrol · session recall · goal_nav + │ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt") + │ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off + │ (~50 ms when matched — NO LLM call) └─ else → _handle_llava(text) ├─ get_frame() (10×50 ms poll, no 1 s stall) ├─ API/llava_api.py::ask(text, img) │ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120) │ → parse_json() → {actions, arm, speak, abort} └─ Brain/executor.py::execute(d) - ├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma + ├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma ├─ arm → API/arm_api.py (stub for now) └─ abort → gradual_stop() ▼ @@ -77,9 +98,17 @@ API/audio_api.py::speak(text, lang="en") ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text) │ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only │ time.sleep(len(text) * 0.08) - └─ unmute mic → back to IDLE + └─ unmute mic → back to listening ``` +**Config knobs** (all in `config_Voice.json::stt`): +- Wake: `speech_threshold` (floor), `min_word_duration`, `max_word_duration`, `post_silence`, `wake_cooldown`, `wake_adaptive_mult`, `wake_diag_log_sec` +- Verify: `wake_verify_enabled` +- Record: `speech_entry_rms`, `silence_exit_rms`, `silence_duration_sec`, `max_record_sec`, `min_record_sec`, `ambient_mult`, `ambient_cap_rms` +- Whisper: `whisper_model`, `whisper_compute_type`, `whisper_beam_size`, `whisper_no_speech_threshold`, `whisper_log_prob_threshold`, `whisper_initial_prompt`, `mic_gain` +- Vocab: `wake_words`, `command_vocab`, `garbage_patterns`, 
`command_vocab_cutoff`, `min_transcription_length` +- Mode: `mode` (`wake_and_command` | `always_on` | `always_on_gated`), `wake_ack` (`tts`|`none`) + --- ## Terminal / WebSocket command pipeline (same brain, skips voice) @@ -169,7 +198,12 @@ Brain/command_parser.py — responds to "lidar status" queries | `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast | | `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) | | `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) | -| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) | +| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" for the wake-verify substring match | +| `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) | +| `stt.garbage_patterns` | config_Voice.json | 17 Whisper noise-hallucinations to reject (`"thanks for watching"`, `"okay"`, etc.) | +| `stt.speech_threshold` etc. | config_Voice.json | energy wake detector thresholds — see `Doc/controlling.md` "Voice" for the full tuning matrix | +| `stt.whisper_*` | config_Voice.json | faster-whisper model, compute type, beam size, confidence gates, bias prompt | +| `stt.mode` | config_Voice.json | `wake_and_command` (default) / `always_on` / `always_on_gated` | | `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff | | `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) | | `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) | @@ -181,9 +215,14 @@ Brain/command_parser.py — responds to "lidar status" queries | Step | Typical | Notes | |---|---|---| -| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk | -| Record until silence | 1–8 s | depends on user speech | -| Whisper small STT | 500–1500 ms | once per command | +| Wake-word detect | <100 ms | pure-numpy energy detector, 50 ms analysis windows | +| Wake verify (first wake) | ~2000 ms | includes faster-whisper `base.en` cold load | +| Wake verify (subsequent) | 300–700 ms | Whisper cached, decodes ~0.5-1.5 s burst | +| "Yes" TTS ack | ~1500 ms | G1 firmware `TtsMaker` minimum | +| Record until silence | 1–5 s | depends on user speech; `max_record_sec=5` cap | +| Pre-silence trim | <1 ms | numpy slice | +| faster-whisper STT | 500–1500 ms | `base.en` int8 on CPU, beam_size=5 | +| Fuzzy-match canonicalisation | <1 ms | difflib against 68 phrases | | Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall | | Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` | | Executor + ZMQ send | <10 ms | fire-and-forget PUB | diff --git a/Models/vosk-model-small-en-us-0.15.zip b/Models/vosk-model-small-en-us-0.15.zip deleted file mode 100644 index 0c94ec8..0000000 Binary files a/Models/vosk-model-small-en-us-0.15.zip and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/README b/Models/vosk-model-small-en-us-0.15/README deleted file mode 100644 index a7f7931..0000000 --- a/Models/vosk-model-small-en-us-0.15/README +++ /dev/null @@ -1,9 +0,0 @@ -US English model for mobile Vosk applications - -Copyright 2020 Alpha Cephei Inc - 
-Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean) -Speed: 0.11xRT (desktop) -Latency: 0.15s (right context) - - diff --git a/Models/vosk-model-small-en-us-0.15/am/final.mdl b/Models/vosk-model-small-en-us-0.15/am/final.mdl deleted file mode 100644 index 5596b31..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/am/final.mdl and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf deleted file mode 100644 index eaa40c5..0000000 --- a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf +++ /dev/null @@ -1,7 +0,0 @@ ---sample-frequency=16000 ---use-energy=false ---num-mel-bins=40 ---num-ceps=40 ---low-freq=20 ---high-freq=7600 ---allow-downsample=true diff --git a/Models/vosk-model-small-en-us-0.15/conf/model.conf b/Models/vosk-model-small-en-us-0.15/conf/model.conf deleted file mode 100644 index 9d5b0da..0000000 --- a/Models/vosk-model-small-en-us-0.15/conf/model.conf +++ /dev/null @@ -1,10 +0,0 @@ ---min-active=200 ---max-active=3000 ---beam=10.0 ---lattice-beam=2.0 ---acoustic-scale=1.0 ---frame-subsampling-factor=3 ---endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 ---endpoint.rule2.min-trailing-silence=0.5 ---endpoint.rule3.min-trailing-silence=0.75 ---endpoint.rule4.min-trailing-silence=1.0 diff --git a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst deleted file mode 100644 index 1f292e6..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst deleted file mode 100644 index 9797b26..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int deleted file mode 100644 index 762fd5f..0000000 --- a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int +++ /dev/null @@ -1,17 +0,0 @@ -10015 -10016 -10017 -10018 -10019 -10020 -10021 -10022 -10023 -10024 -10025 -10026 -10027 -10028 -10029 -10030 -10031 diff --git a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int deleted file mode 100644 index df23fd7..0000000 --- a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int +++ /dev/null @@ -1,166 +0,0 @@ -1 nonword -2 begin -3 end -4 internal -5 singleton -6 nonword -7 begin -8 end -9 internal -10 singleton -11 begin -12 end -13 internal -14 singleton -15 begin -16 end -17 internal -18 singleton -19 begin -20 end -21 internal -22 singleton -23 begin -24 end -25 internal -26 singleton -27 begin -28 end -29 internal -30 singleton -31 begin -32 end -33 internal -34 singleton -35 begin -36 end -37 internal -38 singleton -39 begin -40 end -41 internal -42 singleton -43 begin -44 end -45 internal -46 singleton -47 begin -48 end -49 internal -50 singleton -51 begin -52 end -53 internal -54 singleton -55 begin -56 end -57 internal -58 singleton -59 begin -60 end -61 internal -62 singleton -63 begin -64 end -65 internal -66 singleton -67 begin -68 end -69 internal -70 singleton -71 begin -72 end -73 internal -74 singleton -75 begin -76 end -77 internal -78 singleton -79 begin -80 end -81 internal -82 singleton -83 begin -84 end -85 internal -86 singleton -87 begin -88 end -89 internal -90 singleton -91 begin -92 end -93 internal -94 singleton -95 begin -96 
end -97 internal -98 singleton -99 begin -100 end -101 internal -102 singleton -103 begin -104 end -105 internal -106 singleton -107 begin -108 end -109 internal -110 singleton -111 begin -112 end -113 internal -114 singleton -115 begin -116 end -117 internal -118 singleton -119 begin -120 end -121 internal -122 singleton -123 begin -124 end -125 internal -126 singleton -127 begin -128 end -129 internal -130 singleton -131 begin -132 end -133 internal -134 singleton -135 begin -136 end -137 internal -138 singleton -139 begin -140 end -141 internal -142 singleton -143 begin -144 end -145 internal -146 singleton -147 begin -148 end -149 internal -150 singleton -151 begin -152 end -153 internal -154 singleton -155 begin -156 end -157 internal -158 singleton -159 begin -160 end -161 internal -162 singleton -163 begin -164 end -165 internal -166 singleton diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm deleted file mode 100644 index db789eb..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.ie b/Models/vosk-model-small-en-us-0.15/ivector/final.ie deleted file mode 100644 index 93737bf..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.ie and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.mat b/Models/vosk-model-small-en-us-0.15/ivector/final.mat deleted file mode 100644 index c3ec635..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.mat and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats deleted file mode 100644 index b9d92ef..0000000 --- a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats +++ /dev/null @@ -1,3 +0,0 @@ - [ - 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 - 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] diff --git a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf deleted file mode 100644 index 7748a4a..0000000 --- a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf +++ /dev/null @@ -1 +0,0 @@ -# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf deleted file mode 100644 index 960cd2e..0000000 --- 
a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf +++ /dev/null @@ -1,2 +0,0 @@ ---left-context=3 ---right-context=3 diff --git a/README.md b/README.md index 85161c7..d96d99f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ a Python brain. | **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU | | **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU | | **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU | -| **Ears** (hear) | Always-on wake-word + command transcription | **Whisper tiny** (wake) + **Whisper small** (STT) | Jetson CPU/GPU | +| **Ears** (hear) | Energy-based wake detector + command transcription | **Custom DSP wake** (numpy, no ML) + **faster-whisper base.en int8** (STT) | Jetson CPU | | **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker | | **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU | | **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) | @@ -54,7 +54,7 @@ Camera ─┘ ▼ ├─► Legs (Holosoma Three input modalities, same command loop: -- **Voice** — say "**Sanad, what do you see?**" → wake word fires, Whisper transcribes, brain answers through the G1 speaker. +- **Voice** — say "**Sanad**" → energy detector fires, Whisper verifies the /sa-/ phoneme signature, robot replies "Yes" → speak your command → faster-whisper transcribes → brain answers through the G1 speaker. - **Text** — type the same command into `run_marcus.py`'s terminal. - **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation. @@ -84,7 +84,8 @@ There are two schools for combining them: | Vision — open-ended scene understanding | same VLM | learned | | Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned | | Arms / gestures | SDK action-ID lookup | **hand-coded** | -| Wake-word + STT | Whisper | learned | +| Wake word | Custom energy-envelope DSP (numpy) | hand-coded | +| STT (command) | faster-whisper base.en | learned | | TTS | Unitree `TtsMaker` (on-robot DSP) | firmware | | Glue between layers | Python + ZMQ + JSON | hand-coded | @@ -143,13 +144,13 @@ Same hardware, different prompts + wake word. - **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."* - **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour. -- Wake word changed in `config_Voice.json::stt.wake_words_en`. +- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles Whisper mishearings of "Sanad"). - Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location. - YOLO classes trimmed to people-only if the venue doesn't need object safety. **What you change to switch use cases:** 1. `Config/marcus_prompts.yaml` — persona + task descriptions -2. `Config/config_Voice.json::stt.wake_words_en` — the name people call the robot +2. `Config/config_Voice.json::stt.wake_words` — the name (+ fuzzy variants) people call the robot 3. `Config/config_Vision.json::tracked_classes` — relevant object set 4. 
`Config/config_Brain.json::subsystems.{lidar,voice,imgsearch,autonomous}` — enable what you need 5. Data under `Data/History/Places/places.json` — learned locations @@ -174,6 +175,7 @@ No code changes required for either deployment. Vision/ Navigation/ Voice/ Lidar/ YOLO, imgsearch goal_nav, builtin_mic, SLAM engine patrol, odom builtin_tts, (subprocess) + wake_detector, marcus_voice │ ▼ @@ -253,7 +255,7 @@ Marcus/ ├── Brain/ orchestrator, parser, executor, memory ├── Vision/ YOLO + image-guided search ├── Navigation/ goal nav, patrol, odometry -├── Voice/ built-in mic, built-in TTS, Whisper loop +├── Voice/ built-in mic, TTS, energy wake detector, faster-whisper STT ├── Autonomous/ exploration state machine ├── Lidar/ SLAM engine (subprocess) ├── Server/ WebSocket interface diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index ea8bd13..0dbbb87 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -1,28 +1,29 @@ #!/usr/bin/env python3 """ -Voice/marcus_voice.py — Marcus Wake-Signal Module (no ML, no STT). +Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT). -This is a deliberately-minimal voice subsystem: +Pipeline: + G1 mic ─► custom wake detector (numpy, offline, instant) + │ + ▼ + TTS "Yes" (AudioAPI → G1 TtsMaker) + │ + ▼ + record command audio until silence + │ + ▼ + faster-whisper base.en int8 (CPU) ──► brain callback(text) - - A custom energy-based wake detector (Voice/wake_detector.py) listens - to the G1's on-board mic continuously. - - When the user says any short word (~0.2-1.5 s of speech followed by - silence), wake fires. - - The robot acknowledges via TTS ("Yes" — configurable). - - The user then types their command at the Marcus terminal prompt. +Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no +ML). STT runs only on the recorded command, not on every 2 s of mic input, +so the CPU cost is bounded by how often the user talks. -No Vosk, no Whisper, no torch, no network. Pure numpy DSP. - -Why not STT here: - Both Vosk's small English model ("sanad" absent from lexicon) and - openai-whisper ("!!!!!" numerical garbage on this Jetson's torch-aarch64) - proved unreliable for this hardware. Rather than fight either, the - wake path becomes a simple "did the user say something?" signal. - -Interface with Marcus brain: - VoiceModule(audio_api, on_wake=callback) - on_wake() is called when wake fires. Brain can display a prompt - or do anything else. +Why faster-whisper (CTranslate2) instead of openai-whisper: + The Jetson's torch-aarch64 build has a Categorical sampler bug that + produces NaN logits on low-SNR input, which is exactly what the G1 + far-field mic captures. faster-whisper bypasses torch entirely and + runs the int8-quantized model through CTranslate2 — same quality as + Whisper base, no numerical instability, 3× faster on this hardware. """ from __future__ import annotations @@ -35,7 +36,8 @@ import time from logging.handlers import RotatingFileHandler from typing import Callable, Optional -# ─── PATH + CONFIG ─────────────────────────────────────── +import numpy as np + _PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _PROJECT_DIR not in sys.path: sys.path.insert(0, _PROJECT_DIR) @@ -58,42 +60,176 @@ logging.basicConfig( log = logging.getLogger("marcus_voice") -class VoiceModule: - """Wake-only voice subsystem — fires a callback when speech is detected.""" +# Module-level vocabulary containers. 
EMPTY on import — populated by
+# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
+# command_vocab, garbage_patterns}. Config is the single source of truth;
+# there are no hardcoded string lists here anymore.
+#
+# If you import this module without constructing a VoiceModule() first,
+# these stay empty → fuzzy-match is a no-op, wake detection rejects
+# everything, and the garbage filter rejects nothing. That's by design:
+# bad config = obviously broken behavior, not silently-drifting
+# hardcoded defaults.
+WAKE_WORDS: set = set()
+COMMAND_VOCAB: list = []
+GARBAGE_PATTERNS: set = set()
+_MIN_TRANSCRIPTION_LENGTH: int = 3

-    def __init__(self, audio_api, on_command: Optional[Callable] = None,
-                 on_wake: Optional[Callable] = None):
-        """
-        Args:
-            audio_api:  AudioAPI instance (for TTS ack).
-            on_command: kept for API compatibility; always called with
-                        text="" because there's no STT. Brain should
-                        prompt the user to type.
-            on_wake:    alternative callback fired when wake detected.
-                        Exactly one of on_command / on_wake is used.
-        """
+
+def _has_wake_word(text: str) -> bool:
+    """
+    True if the utterance contains any wake-word variant as a *whole word*
+    (word-boundary match, not substring — so "standard" doesn't trigger
+    off "sand").
+    """
+    import re
+    low = text.lower()
+    for w in WAKE_WORDS:
+        if re.search(r'\b' + re.escape(w) + r'\b', low):
+            return True
+    return False
+
+
+def _strip_wake_word_once(text: str) -> str:
+    """Single pass of wake-word stripping. Use via _strip_wake_word()."""
+    import re
+    stripped = text.strip()
+
+    # Case 1: the entire utterance is just a wake word + optional
+    # trailing punctuation. Return empty string so caller can ack-only.
+    for w in WAKE_WORDS:
+        if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
+            return ""
+
+    # Case 2: "Sanad <command>" — require whitespace (or comma + ws)
+    # between the wake word and the command so "Sanad." doesn't swallow
+    # "." as a command.
+    for w in sorted(WAKE_WORDS, key=len, reverse=True):
+        m = re.match(
+            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
+            text, re.IGNORECASE,
+        )
+        if m:
+            return m.group(1).strip(' ,.!?')
+
+        # Case 3: "<command> Sanad" — trailing wake word.
+        m = re.match(
+            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
+            text, re.IGNORECASE,
+        )
+        if m:
+            return m.group(1).strip(' ,.!?')
+
+    return text
+
+
+def _strip_wake_word(text: str) -> str:
+    """
+    Remove the wake word from the start or end of text, iteratively,
+    so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse
+    to the actual command (or empty string if nothing else was said).
+
+    Examples:
+        "Sanad, turn left"  → "turn left"
+        "Sanad turn left"   → "turn left"
+        "turn left Sanad"   → "turn left"
+        "Sanad."            → ""
+        "Sanad"             → ""
+        "Sanad. Sanad."     → ""      (was leaving "Sanad" before)
+        "Sanad Sanad stop"  → "stop"  (iterative strip)
+    """
+    # Iterate until stable — each pass peels off one wake word. Cap the
+    # iterations so malicious/garbled input can't loop forever.
+    for _ in range(5):
+        stripped = _strip_wake_word_once(text)
+        if stripped == text:
+            return text
+        text = stripped
+    return text
+
+
+def _closest_command(text: str, cutoff: float = 0.72) -> str:
+    """
+    Map a Whisper transcription to the closest known command phrase.
+
+    Returns the canonical command if there's a close-enough match, else
+    returns the original text unchanged.
Close = difflib SequenceMatcher + ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while + accepting common Whisper near-misses like "Turn right up"→"turn right" + or "What do you see?"→"what do you see"). + + Also handles the "transcription contains a command" case — if the + text has a command phrase as a substring (e.g. "Sanad, turn left" + from an echo), extract the command. + """ + from difflib import SequenceMatcher + low = text.lower().strip().rstrip(".!?,") + if not low: + return text + + # Cheap substring win first — no fuzzy needed if the command is + # literally in the transcription. + for cmd in COMMAND_VOCAB: + if cmd in low: + return cmd + + best_cmd = None + best_ratio = 0.0 + for cmd in COMMAND_VOCAB: + r = SequenceMatcher(None, low, cmd).ratio() + if r > best_ratio: + best_ratio = r + best_cmd = cmd + + if best_ratio >= cutoff: + return best_cmd + return text + + +class VoiceModule: + def __init__( + self, + audio_api, + on_command: Optional[Callable] = None, + on_wake: Optional[Callable] = None, + ): self._audio = audio_api self._on_command = on_command self._on_wake = on_wake - self._config = load_config("Voice") + self._config = load_config("Voice") self._stt = self._config.get("stt", {}) self._messages = self._config.get("messages", {}) - # Wake-detector parameters (tweakable via config_Voice.json::stt). + # Load all voice vocabulary from config — these are the only + # string lists the voice layer uses, and they come from + # config_Voice.json. If a key is missing, the list is empty and + # that feature silently degrades (fuzzy-match no-op, nothing + # rejected as garbage, no wake-word match) — NEVER crashes. + global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH + WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])} + COMMAND_VOCAB = list(self._stt.get("command_vocab", [])) + GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])} + _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3)) + self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72)) + log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", + len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS)) + + # ── Custom wake detector ── from Voice.wake_detector import WakeDetector, WakeConfig wcfg = WakeConfig( - sample_rate = 16_000, - speech_threshold = float(self._stt.get("speech_threshold", 150.0)), - min_word_duration_s= float(self._stt.get("min_word_duration", 0.20)), - max_word_duration_s= float(self._stt.get("max_word_duration", 1.50)), - post_silence_s = float(self._stt.get("post_silence", 0.30)), - cooldown_s = float(self._stt.get("wake_cooldown", 1.50)), - chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), + sample_rate = 16_000, + speech_threshold = float(self._stt.get("speech_threshold", 80.0)), + min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)), + max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)), + post_silence_s = float(self._stt.get("post_silence", 0.30)), + cooldown_s = float(self._stt.get("wake_cooldown", 1.50)), + chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), + adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)), + adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)), + diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)), ) self._detector = WakeDetector(wcfg) - # G1 built-in mic (UDP multicast). 
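The cutoff logic in `_closest_command` above is easy to sanity-check in isolation. A minimal standalone sketch of the same difflib rule — the vocabulary and test phrases here are illustrative, not the shipped `stt.command_vocab`:

```python
# Standalone sketch of the _closest_command matching rule (illustrative
# vocabulary — the real list comes from config_Voice.json::stt.command_vocab).
from difflib import SequenceMatcher

VOCAB = ["turn left", "turn right", "move forward", "stop", "what do you see"]

def closest(text: str, cutoff: float = 0.72) -> str:
    low = text.lower().strip().rstrip(".!?,")
    # Cheap substring win first, then best SequenceMatcher ratio.
    for cmd in VOCAB:
        if cmd in low:
            return cmd
    best = max(VOCAB, key=lambda cmd: SequenceMatcher(None, low, cmd).ratio())
    if SequenceMatcher(None, low, best).ratio() >= cutoff:
        return best
    return text

print(closest("Turn right up."))   # → "turn right"      (substring win)
print(closest("tell me a story"))  # → "tell me a story" (below cutoff, unchanged)
```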
+ # ── G1 mic ── from Voice.builtin_mic import BuiltinMic _mcfg = self._config.get("mic_udp", {}) self._mic_capture = BuiltinMic( @@ -101,63 +237,904 @@ class VoiceModule: port = _mcfg.get("port", 5555), buf_max = _mcfg.get("buffer_max_bytes", 64000), ) + self._sample_rate = self._mic_capture.sample_rate + + # ── global software mic gain ── + # Applied to every byte read from the mic, so wake detector, VAD, + # AND Whisper all see the boosted audio. One knob, uniform effect. + # G1 far-field mic benefits from 2.0-3.0 for normal speaking volume; + # above 4.0 you start clipping loud words. + self._mic_gain = float(self._stt.get("mic_gain", 1.0)) + if self._mic_gain != 1.0: + log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) + + # ── faster-whisper (lazy-init on first wake) ── + self._fw = None self._running = False self._thread = None + self._cooldown_until = 0.0 + log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") + # ─── gain-applied mic read ──────────────────────────── + + def _read_mic_raw(self, num_bytes: int) -> bytes: + """Raw mic read — no gain. Used by the wake detector whose + thresholds are calibrated against unamplified G1 ambient.""" + return self._mic_capture.read_chunk(num_bytes) + + def _read_mic_gained(self, num_bytes: int) -> bytes: + """ + Mic read with self._mic_gain applied. Used during command + recording so Whisper sees a louder, cleaner signal. NOT used + in the wake loop — amplifying ambient there pushes it over + the wake threshold and the detector can never find its + silent baseline. + """ + raw = self._mic_capture.read_chunk(num_bytes) + if not raw or self._mic_gain == 1.0: + return raw + arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain + return np.clip(arr, -32768, 32767).astype(np.int16).tobytes() + + # ─── lazy faster-whisper init ───────────────────────── + + def _get_fw(self): + """Load faster-whisper on first use — startup saved for cold path.""" + if self._fw is not None: + return self._fw + model = self._stt.get("whisper_model", "base.en") + device = self._stt.get("whisper_device", "cpu") + compute = self._stt.get("whisper_compute_type", "int8") log.info( - "VoiceModule initialized (custom wake detector, " - "speech_threshold=%s, min/max_word=%s/%s s)", - wcfg.speech_threshold, wcfg.min_word_duration_s, wcfg.max_word_duration_s, + "Loading faster-whisper: model=%s device=%s compute=%s", + model, device, compute, ) + try: + from faster_whisper import WhisperModel + self._fw = WhisperModel(model, device=device, compute_type=compute) + log.info("faster-whisper ready") + except Exception as e: + log.error("faster-whisper init failed: %s — voice will be wake-only", e) + self._fw = None + return self._fw + + # ─── command recording ──────────────────────────────── + + def _record_command(self) -> np.ndarray: + """ + Record the user's command with a hysteretic, adaptive-baseline VAD. + + Design (handles quiet, normal, and loud voices on the G1 mic): + + 1. Sample 200 ms of ambient noise first to learn the floor, + then set the "silence" gate to max(ambient * 2.5, floor). + Eliminates the "my silence threshold is higher than my + user's speaking level" failure mode. + + 2. Two thresholds with hysteresis: + speech_entry — RMS required to count as "speech started" + silence_exit — RMS below which we count silence + (< speech_entry; prevents mid-word bail on + breaths and short consonant gaps). + + 3. Recording can only *end* after we've actually heard speech. 
+           Pure silence just runs out to max_record_sec, then returns
+           empty (the caller plays "I didn't catch that" without
+           burning a Whisper call on noise).
+
+        4. After speech is seen, silence_budget accumulates only while
+           RMS stays below silence_exit. A single loud burst resets
+           it to zero — so natural "turn... left" pauses don't end the
+           recording.
+        """
+        # ── config knobs (all overridable via config_Voice.json::stt) ─
+        speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0))
+        silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0))
+        silence_dur      = float(self._stt.get("silence_duration_sec", 1.2))
+        max_dur          = float(self._stt.get("max_record_sec", 8.0))
+        min_dur          = float(self._stt.get("min_record_sec", 0.4))
+        # Legacy knob — unused since the record-time probe was removed
+        # (we reuse the wake detector's baseline instead; see below).
+        ambient_probe_s  = float(self._stt.get("ambient_probe_sec", 0.2))
+        ambient_mult     = float(self._stt.get("ambient_mult", 2.5))
+
+        small_chunk_bytes = 1024
+        analysis_ms       = 100
+        analysis_bytes    = int(self._sample_rate * analysis_ms / 1000) * 2
+
+        # ── 1. Reuse the wake detector's baseline instead of probing
+        # the mic right now. The wake detector's _baseline is a rolling
+        # mean of idle-silence RMS values from the last few seconds.
+        #
+        # Why NOT probe at record-time: we arrive here right after TTS
+        # "Yes", and the user typically starts speaking within 200 ms
+        # of hearing the ack. A probe window sized to the ambient floor
+        # then measures the *user's speech* as "ambient" and sets
+        # speech_entry above the user's actual amplitude — causing the
+        # "no speech in 8.00s" failure mode observed in the wild.
+        #
+        # Cap the baseline at a sensible ceiling so a one-off loud
+        # transient during idle doesn't lock us out either.
+        ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0
+        ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0))
+        ambient_rms = min(ambient_rms, ambient_cap)
+
+        if ambient_rms > 0:
+            adaptive_exit  = max(silence_exit_rms, ambient_rms * ambient_mult)
+            adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8)
+        else:
+            adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms
+
+        log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) "
+                 "speech_entry=%.0f silence_exit=%.0f",
+                 ambient_rms, ambient_cap, adaptive_entry, adaptive_exit)
+
+        # ── 2. main capture loop ──────────────────────────────────────
+        # The loop also records speech_start_byte (declared just below) —
+        # the byte offset into `collected` at which speech first crossed
+        # adaptive_entry. We trim pre-speech silence to that point (minus
+        # ~300 ms pre-roll) before returning. Keeping Whisper's input
+        # tight (speech + small tails) improves transcription accuracy by
+        # removing the ambient/HVAC portion that dilutes the mel features.
+        collected       = bytearray()   # no record-time probe any more — starts empty
+        analysis_buf    = bytearray()
+        silence_budget  = 0.0
+        total_time      = 0.0
+        speech_seen     = False
+        peak_rms_seen   = 0.0
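Since the gate arithmetic above is where the field failures have come from, here is the same derivation as a standalone, printable sketch. The helper name and the scenario numbers are illustrative, not part of the module; the defaults mirror the config keys read above:

```python
# Worked example of the adaptive VAD gates, using the default knobs.
# Values are illustrative — real ones come from config_Voice.json::stt.
def vad_gates(ambient_rms: float,
              speech_entry_rms: float = 250.0,
              silence_exit_rms: float = 120.0,
              ambient_cap_rms: float = 200.0,
              ambient_mult: float = 2.5) -> tuple:
    ambient = min(ambient_rms, ambient_cap_rms)
    if ambient <= 0:
        return speech_entry_rms, silence_exit_rms
    entry = max(speech_entry_rms, ambient * ambient_mult * 1.8)
    exit_ = max(silence_exit_rms, ambient * ambient_mult)
    return entry, exit_

print(vad_gates(40))    # quiet room → (250.0, 120.0): config floors win
print(vad_gates(150))   # noisy room → (675.0, 375.0): baseline dominates
print(vad_gates(500))   # loud spike → capped at 200 → (900.0, 500.0)
```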
+ speech_start_byte: Optional[int] = None + preroll_bytes = int(self._sample_rate * 0.3) * 2 # 300 ms + wall_start = time.time() + + while total_time < max_dur and (time.time() - wall_start) < max_dur + 2: + raw = self._read_mic_gained(small_chunk_bytes) + if not raw: + time.sleep(0.005) + continue + collected.extend(raw) + analysis_buf.extend(raw) + total_time += (len(raw) // 2) / self._sample_rate + + while len(analysis_buf) >= analysis_bytes: + win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16) + del analysis_buf[:analysis_bytes] + rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2))) + peak_rms_seen = max(peak_rms_seen, rms) + + if rms >= adaptive_entry: + if not speech_seen: + speech_seen = True + # Record where speech started (byte offset + # in `collected`) so we can trim pre-roll later. + speech_start_byte = max(0, len(collected) - preroll_bytes) + silence_budget = 0.0 + elif speech_seen and rms < adaptive_exit: + silence_budget += analysis_ms / 1000.0 + # between exit and entry → hold state (hysteresis zone) + + # end only *after* we've heard real speech + if (speech_seen + and silence_budget >= silence_dur + and total_time >= min_dur): + log.info("silence after speech at %.2fs (peak_rms=%.0f)", + total_time, peak_rms_seen) + break + + if not speech_seen: + log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping", + total_time, peak_rms_seen, adaptive_entry) + return np.array([], dtype=np.int16) + + if total_time >= max_dur: + log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)", + total_time, peak_rms_seen) + + # Trim leading pre-speech silence. Keep 300 ms of pre-roll so + # the onset of the first phoneme is preserved for Whisper. + if speech_start_byte and speech_start_byte > 0: + trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000 + log.info("trimmed %.0f ms of leading silence " + "(pre-speech buffer %d bytes)", + trimmed_ms, speech_start_byte) + collected = collected[speech_start_byte:] + + return (np.frombuffer(bytes(collected), dtype=np.int16) + if collected else np.array([], dtype=np.int16)) + + # ─── transcription ──────────────────────────────────── + + def _transcribe(self, audio_i16: np.ndarray) -> str: + """int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" + fw = self._get_fw() + if fw is None: + return "" + + # mic_gain was already applied in _read_mic_gained() during + # _record_command, so audio_i16 here is already boosted. + + # int16 → float32 [-1, 1] + DSP pre-processing: + # 1. DC offset removal (subtract mean) — removes any mic bias + # 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise, + # and speaker-vibration resonance. Whisper ignores the + # rumble band anyway, but it inflates RMS estimation and + # steals dynamic range from the speech band. + # 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost + # that sharpens consonants (/t/, /s/, /k/ plosives/fricatives) + # which Whisper's mel features care most about. + # 4. Peak-normalize to 0.7. + audio_f32 = audio_i16.astype(np.float32) / 32768.0 + # 1. DC removal + audio_f32 = audio_f32 - np.mean(audio_f32) + # 2. High-pass at 80 Hz (1-pole IIR, stable + cheap) + audio_f32 = self._highpass_80hz(audio_f32) + # 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1] + audio_f32 = np.append( + audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1] + ) + # 4. 
Peak-normalize
+        peak = float(np.abs(audio_f32).max())
+        if peak > 1e-4 and peak < 0.7:
+            boost = 0.7 / peak
+            audio_f32 = audio_f32 * boost
+            log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak)
+
+        # Initial prompt biases the model toward our command vocabulary.
+        # Whisper uses it as decoder context — words in the prompt become
+        # more likely, which converts ambiguous low-SNR audio like "muv rahh"
+        # from a plausible English phrase ("and provide") into the intended
+        # command ("move right"). Keep it short — long prompts can be echoed
+        # back verbatim in the transcription.
+        init_prompt = self._stt.get(
+            "whisper_initial_prompt",
+            "turn left, turn right, move forward, walk back, stop, come here, "
+            "sit down, stand up, raise arm, wave, look around, what do you see, "
+            "remember this, go home, patrol."
+        )
+
+        beam_size = int(self._stt.get("whisper_beam_size", 5))
+        no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6))
+        log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0))
+        compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4))
+
+        # Temperature fallback: greedy first (T=0), then 0.2, then 0.4.
+        # faster-whisper retries automatically when a pass is rejected by
+        # its confidence gates (log_prob < threshold etc.). On noisy
+        # audio this commonly rescues a bad greedy decode.
+        temperatures = self._stt.get(
+            "whisper_temperature_fallback", [0.0, 0.2, 0.4]
+        )
+        try:
+            segments, info = fw.transcribe(
+                audio_f32,
+                language="en",
+                beam_size=beam_size,          # 5 = much better than greedy on noisy audio
+                temperature=temperatures,     # greedy → 0.2 → 0.4 fallback
+                initial_prompt=init_prompt,   # command-vocabulary bias (defaults to the list above)
+                condition_on_previous_text=False,
+                vad_filter=False,             # we already trimmed silence
+                without_timestamps=True,
+                # Whisper's built-in gates — drop transcripts that look
+                # like hallucinations (very low prob, highly compressed).
+                no_speech_threshold=no_speech_threshold,
+                log_prob_threshold=log_prob_threshold,
+                compression_ratio_threshold=compression_ratio_t,
+            )
+            # Collect segments and their mean log-prob for a confidence signal.
+            seg_list = list(segments)
+            text = " ".join(s.text for s in seg_list).strip()
+            nsp = float(getattr(info, "no_speech_prob", 0.0))
+            if seg_list:
+                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
+                log.info("whisper: lp=%.2f nsp=%.2f text=%r",
+                         mean_lp, nsp, text[:80])
+            else:
+                # CRITICAL: log even when Whisper returned zero segments
+                # so we can see WHY it dropped everything. Usually nsp is
+                # above the threshold or the log-prob gate killed it.
+                log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop",
+                         nsp, no_speech_threshold, log_prob_threshold)
+        except Exception as e:
+            log.error("faster-whisper transcribe failed: %s", e)
+            return ""
+
+        if not text:
+            return ""
+
+        # Reject Whisper garbage patterns (stt.garbage_patterns) and
+        # transcriptions shorter than stt.min_transcription_length.
+        # Preserve:
+        #   - bare wake words (a valid "just Sanad" signal → ack)
+        #   - exact matches in stt.command_vocab (legitimate short
+        #     commands like "go", "hi" must survive the length filter)
+        low = text.lower().strip().rstrip(".!?,")
+        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
+        if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
+            if low not in WAKE_WORDS and low not in vocab_exact:
+                log.info("Rejecting likely noise transcription: %r", text)
+                return ""
+
+        # NOTE: fuzzy-match to the canonical command phrase used to happen
+        # here, but that ran BEFORE gated mode could see the wake word.
+        # Moved to _normalize_command(), called at dispatch time AFTER the
+        # wake-word gate + wake-word strip, so the gate always sees the
+        # raw Whisper text.
+        return text
+
+    @staticmethod
+    def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray:
+        """
+        1-pole IIR high-pass at ~80 Hz. Attenuates HVAC/fan rumble
+        without touching the speech band. Cheap: one multiply and two
+        adds per sample.
+        """
+        if x.size < 2:
+            return x
+        # Alpha from fc = 80 Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc)
+        import math
+        rc = 1.0 / (2 * math.pi * 80.0)
+        dt = 1.0 / sr
+        alpha = rc / (rc + dt)
+        y = np.empty_like(x)
+        y[0] = x[0]
+        # Plain per-sample Python loop — an IIR filter is sequential, so
+        # numpy can't vectorize it directly. At command lengths (a few
+        # seconds of 16 kHz audio) this costs a few tens of milliseconds,
+        # negligible next to the Whisper decode that follows.
+        prev_y, prev_x = x[0], x[0]
+        for i in range(1, x.size):
+            cur = alpha * (prev_y + x[i] - prev_x)
+            y[i] = cur
+            prev_y, prev_x = cur, x[i]
+        return y
+
+    def _transcribe_raw(self, audio_i16: np.ndarray) -> str:
+        """
+        Like _transcribe but WITHOUT the garbage-pattern / length filters
+        and without the `initial_prompt` bias. Used for wake verify, where:
+          - We only care about the first phoneme (s/sh/z) — a 2-char "so"
+            is a valid /sa-/ signature and MUST NOT be dropped by
+            min_transcription_length.
+          - A biased initial_prompt makes Whisper echo itself on unclear
+            audio ("This is a robot assistant" → not s-starting → reject).
+        The downside (no "Sanad" nudge) is fine here because the acoustic
+        detector has already gated out non-speech.
+        """
+        fw = self._get_fw()
+        if fw is None:
+            return ""
+        if self._mic_gain != 1.0:
+            audio_i16 = np.clip(
+                audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767
+            ).astype(np.int16)
+        audio_f32 = audio_i16.astype(np.float32) / 32768.0
+        peak = float(np.abs(audio_f32).max())
+        if peak > 1e-4 and peak < 0.7:
+            audio_f32 = audio_f32 * (0.7 / peak)
+        try:
+            segments, info = fw.transcribe(
+                audio_f32,
+                language="en",
+                beam_size=int(self._stt.get("whisper_beam_size", 5)),
+                temperature=0.0,
+                initial_prompt="",            # NO bias → NO prompt echo
+                condition_on_previous_text=False,
+                vad_filter=False,
+                without_timestamps=True,
+                # Looser gates — we're about to do a phonetic match,
+                # not trust the transcription verbatim.
+                no_speech_threshold=0.85,
+                log_prob_threshold=-1.8,
+                compression_ratio_threshold=3.0,
+            )
+            seg_list = list(segments)
+            text = " ".join(s.text for s in seg_list).strip()
+            if seg_list:
+                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
+                log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r",
+                         mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80])
+            return text
+        except Exception as e:
+            log.error("whisper-raw transcribe failed: %s", e)
+            return ""
+
+    # ─── command transcription ────────────────────────────
+
+    def _transcribe_command(self, audio_i16: np.ndarray) -> str:
+        """
+        Decode the recorded command audio with faster-whisper.
Thin + wrapper over self._transcribe(); exists so _handle_wake and the + always-on loop share one entry point. + """ + if audio_i16.size == 0: + return "" + return self._transcribe(audio_i16) + + def _save_turn_wav( + self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd", + ) -> Optional[str]: + """ + Save a single-turn command recording for debugging. + + Filename: {tag}_{epoch}_{sanitised_transcription}.wav + Examples: + cmd_1728562000_turn_right.wav ← successful command + cmd_1728562030_hi.wav ← Whisper misheard as 'Hi' + unk_1728562045_.wav ← Whisper returned empty + cmd_1728562060_thanks_for_watch.wav ← garbage-filtered + + Rotation: keeps the most recent 50 across all tags so the disk + doesn't fill up during a long session. Tunable via + stt.recording_keep_count. + """ + try: + import re as _re + import wave + rec_dir = os.path.join( + PROJECT_ROOT, + self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), + ) + os.makedirs(rec_dir, exist_ok=True) + + # Rotate — keep only the most recent N across all command WAVs. + keep = int(self._stt.get("recording_keep_count", 50)) + existing = sorted( + f for f in os.listdir(rec_dir) + if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav") + ) + for old in existing[:max(0, len(existing) - keep + 1)]: + try: os.remove(os.path.join(rec_dir, old)) + except Exception: pass + + # Sanitise transcription for filename: lowercase, alnum + _, <=40 chars + slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40] + path = os.path.join( + rec_dir, f"{tag}_{int(time.time())}_{slug}.wav" + ) + with wave.open(path, "wb") as w: + w.setnchannels(1) + w.setsampwidth(2) + w.setframerate(self._sample_rate) + w.writeframes(audio_i16.astype(np.int16).tobytes()) + return path + except Exception as e: + log.warning("failed to save turn wav: %s", e) + return None + + def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]: + """Backward-compat wrapper — save with the `unk` tag.""" + return self._save_turn_wav(audio_i16, transcription="", tag="unk") + + # ─── command normalization (post-gate) ──────────────── + + def _normalize_command(self, text: str) -> str: + """ + Apply fuzzy-match to the closest canonical command phrase. + Call AFTER the gated wake check so the wake word has already + been stripped by the caller if appropriate. Turns near-misses + like "Turn right up" → "turn right" so command_parser.py's + regex fast-path can hit them without an LLM round-trip. + """ + canonical = _closest_command(text, cutoff=self._vocab_cutoff) + if canonical != text: + log.info("fuzzy-match: %r → %r", text, canonical) + return canonical # ─── main loop ──────────────────────────────────────── def _voice_loop(self): + """ + Dispatch to the right loop based on stt.mode: + "wake_and_command" — require "Sanad" wake word (acoustic), then + record and transcribe a command. + "always_on" — Transcribe every utterance, log all, and + dispatch all to the brain. No wake. + "always_on_gated" — Transcribe every utterance and log all, + but ONLY dispatch utterances that contain + "Sanad" (fuzzy). Wake word is stripped + before the command is sent to the brain. 
+ """ + mode = self._stt.get("mode", "wake_and_command").lower() self._mic_capture.start() - log.info("Voice loop started — listening for wake (energy-based, no ML)") + if mode in ("always_on", "always_on_gated"): + self._voice_loop_always_on(gated=(mode == "always_on_gated")) + else: + self._voice_loop_wake() + + def _voice_loop_wake(self): + """Classic wake-and-command: listen for 'Sanad', then record command.""" + log.info("Voice loop started — listening for wake (energy-based)") + + was_speaking = False while self._running: try: - # Don't listen while the robot is speaking (prevents - # self-trigger from TTS output leaking into the mic). if self._audio.is_speaking: + was_speaking = True time.sleep(0.1) self._detector.reset() continue - chunk = self._mic_capture.read_chunk(1024) # ~32 ms at 16 kHz + if was_speaking: + time.sleep(0.25) + self._mic_capture.flush() + self._detector.reset() + was_speaking = False + + if time.time() < self._cooldown_until: + _ = self._read_mic_raw(1024) + self._detector.reset() + time.sleep(0.05) + continue + + chunk = self._read_mic_raw(1024) if not chunk: continue if self._detector.process(chunk): - self._on_wake_fired() + self._handle_wake() except Exception as e: log.error("Voice loop error: %s", e, exc_info=True) time.sleep(1) - def _on_wake_fired(self): - log.info("Wake detected (acoustic)") - print("\n [Sanad] wake heard — type your command at the prompt.") - # TTS ack - msg = self._messages.get("wake_heard", "Yes") - try: - self._audio.speak(msg) - except Exception as e: - log.warning("TTS ack failed: %s", e) + def _voice_loop_always_on(self, gated: bool = False): + """ + Always-on mode — Sanad-style continuous listening. - # Brain callbacks for compatibility with the old interface. - if self._on_wake: + If `gated` is True, utterances that don't contain the wake word + "Sanad" (or a fuzzy variant) are logged but NOT dispatched to the + brain — the robot hears everything, speaks only when addressed. + + Architecture (no wake word, no ack TTS): + 1. Continuously read the gained mic stream in 32 ms chunks. + 2. Run a hysteretic VAD on the stream — speech_entry_rms + starts an utterance, silence_exit_rms + silence_duration + ends one. + 3. On each utterance end → Whisper transcribe → fuzzy-match + → dispatch to brain. + 4. Every ~5 s of idle: log a `ambient: rms=... peak=...` line + so you can SEE what the mic is doing at all times, even + when nobody's talking. Matches Sanad's "always listening" + visibility. + 5. Speech is not gated on amplitude — everything above the + entry threshold is captured, quiet or loud. Loud speech + clips naturally against int16; Whisper handles it. + + Thresholds come from the same stt.* config as wake mode but are + typically tuned lower here (you want eager capture since there's + no wake-word gate to prevent false positives). 
+ """ + log.info( + "Voice loop started — ALWAYS-ON mode%s", + " [gated: only 'Sanad' utterances dispatched]" if gated + else " (no wake word — every utterance dispatched)" + ) + + speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0)) + silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0)) + silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8)) + min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3)) + max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0)) + idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0)) + ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4)) + ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100)) + + buffer = bytearray() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + idle_peak_rms = 0.0 + idle_sum_rms = 0.0 + idle_chunks = 0 + last_idle_log = time.time() + was_speaking_tts = False + + # Rolling ambient (idle-only) RMS buffer. Used to adapt silence_exit + # so a noisy room doesn't trap the VAD at max_utter_s: if the + # observed idle floor sits at rms=200, silence_exit needs to be + # above 200 or silence never accumulates. We take + # effective_exit = max(config_silence_exit, ambient_floor * mult). + ambient_buf: list = [] + ambient_floor = 0.0 + + # Seed ambient_floor by sampling ~1s of mic BEFORE entering the + # loop. Without this, the very first utterance runs with + # ambient_floor=0 → eff_exit=config_floor, which under-cuts + # noisy rooms and creates self-sustaining echo loops. + seed_chunks = [] + seed_deadline = time.time() + 1.0 + while time.time() < seed_deadline: + r = self._read_mic_gained(1024) + if r: + a = np.frombuffer(r, dtype=np.int16) + if a.size: + seed_chunks.append( + float(np.sqrt(np.mean(a.astype(np.float64) ** 2))) + ) + else: + time.sleep(0.005) + if seed_chunks: + # Use the median so one loud transient doesn't poison the seed. + seed_chunks.sort() + ambient_floor = seed_chunks[len(seed_chunks) // 2] + ambient_buf = list(seed_chunks[-ambient_win:]) + log.info("ambient seeded: floor=%.0f from %d chunks", + ambient_floor, len(seed_chunks)) + + while self._running: try: - self._on_wake() + # Drop mic input while the robot itself is speaking so we + # don't feed our own TTS back through Whisper. + if self._audio.is_speaking: + was_speaking_tts = True + buffer.clear() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + time.sleep(0.1) + continue + + if was_speaking_tts: + time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3))) + self._mic_capture.flush() + was_speaking_tts = False + + raw = self._read_mic_gained(1024) + if not raw: + time.sleep(0.005) + continue + + arr = np.frombuffer(raw, dtype=np.int16) + rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2))) + chunk_s = (len(raw) // 2) / self._sample_rate + + if in_speech: + buffer.extend(raw) + speech_duration += chunk_s + peak_rms = max(peak_rms, rms) + + # Adaptive silence exit: sits max(config_floor, + # ambient_floor × mult). Prevents the "room is noisier + # than silence_exit" failure mode where silence never + # accumulates and every utterance hits max_utter_s. 
+ eff_exit = max(silence_exit, ambient_floor * ambient_mult) + if rms < eff_exit: + silence_budget += chunk_s + else: + silence_budget = 0.0 + + utter_over = (silence_budget >= silence_dur and + speech_duration >= min_utter_s) + force_stop = speech_duration >= max_utter_s + + if utter_over or force_stop: + reason = "max-duration" if force_stop else "silence" + audio = np.frombuffer(bytes(buffer), dtype=np.int16) + log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d", + reason, speech_duration, peak_rms, audio.size) + + # RESET STATE IMMEDIATELY — before any Whisper / + # speak() / dispatch. Previously a `continue` from + # the wake-only ack branch skipped the reset, and + # the 12-second buffer lived forever, re-transcribed + # every iteration into the same "Sanad" output, + # spawning a self-sustaining "Yes" loop. + buffer.clear() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + + text = self._transcribe_command(audio) if audio.size else "" + if text: + log.info("HEARD: %r", text) + # Gated mode: only dispatch if the wake word was + # spoken. Everything is still logged above so the + # operator has full visibility into what the mic + # is picking up. + if gated and not _has_wake_word(text): + log.info(" (no wake word — not dispatched)") + else: + if gated: + command = _strip_wake_word(text) + if command != text: + log.info(" wake-stripped: %r → %r", + text, command) + # Bare wake word ("Sanad.", "Sanad") → + # speak a "Yes" ack, do NOT call the + # brain (it would hallucinate a random + # response from a 1-word prompt). + if not command: + log.info(" wake-only utterance — speaking ack") + try: + self._audio.speak( + self._messages.get("wake_heard", "Yes") + ) + except Exception as e: + log.warning("wake-ack TTS failed: %s", e) + continue + else: + command = text + + # Normalize near-misses ("Turn right up" → + # "turn right") so command_parser's regex + # fast-path can hit without an LLM round-trip. + command = self._normalize_command(command) + + print(f' [Sanad] heard: "{command}"') + if self._on_command: + try: + self._on_command(command, "en") + except Exception as e: + log.error("on_command: %s", e, exc_info=True) + else: + log.info("utterance rejected (empty/garbage after Whisper)") + else: + idle_peak_rms = max(idle_peak_rms, rms) + idle_sum_rms += rms + idle_chunks += 1 + + # Maintain the rolling ambient floor so silence_exit can + # adapt. Use windows that are *clearly* not speech + # (rms < speech_entry / 2) — otherwise a borderline + # window just before transition pollutes the floor. 
+ if rms < speech_entry * 0.5: + ambient_buf.append(rms) + if len(ambient_buf) > ambient_win: + ambient_buf.pop(0) + if ambient_buf: + ambient_floor = sum(ambient_buf) / len(ambient_buf) + + if rms >= speech_entry: + # utterance starts — keep this chunk as pre-roll + log.info("utterance start (rms=%.0f >= entry=%.0f)", + rms, speech_entry) + buffer.extend(raw) + in_speech = True + speech_duration = chunk_s + peak_rms = rms + silence_budget = 0.0 + + # periodic ambient log while idle — "I am listening" + now = time.time() + if (now - last_idle_log) >= idle_log_s and idle_chunks > 0: + eff_exit = max(silence_exit, ambient_floor * ambient_mult) + log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d " + "floor=%.0f entry=%.0f eff_exit=%.0f", + idle_sum_rms / idle_chunks, idle_peak_rms, + idle_chunks, ambient_floor, speech_entry, eff_exit) + idle_peak_rms = 0.0 + idle_sum_rms = 0.0 + idle_chunks = 0 + last_idle_log = now except Exception as e: - log.error("on_wake callback error: %s", e) - elif self._on_command: - # Old API expected (text, lang). We have no transcription, so - # pass empty text — brain is expected to prompt for typed input. + log.error("Always-on voice loop error: %s", e, exc_info=True) + time.sleep(1) + + def _handle_wake(self): + t_wake = time.time() + log.info("Wake detected (acoustic)") + + # Verify the burst that triggered wake actually sounds like a + # wake word. The acoustic detector fires on ANY 0.2-1.5s burst + # (coughs, claps, door slams). We run a lightweight Whisper + # decode on the burst and accept if EITHER: + # (a) a wake-word variant is in the transcription, OR + # (b) the transcription starts with 's'/'sh'/'z' — Whisper's + # consistent signature for mishearing non-English "Sanad" + # as an English /sa-/ word ("Stop", "Set", "Sand", "Send"). + # Reject if Whisper returns empty (pure noise / cough) or + # confidently not-s speech ("hello", "okay"). + if self._stt.get("wake_verify_enabled", True): + burst = self._detector.get_last_burst() + if burst is not None and burst.size >= int(0.15 * self._sample_rate): + t_verify = time.time() + # Lenient transcribe — no garbage filter, no min-length, + # no bias prompt. See _transcribe_raw docstring. 
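The accept/reject rule just described is compact enough to state as a standalone predicate. A sketch — the wake-word set is illustrative (the real one is `stt.wake_words`) and the helper name is hypothetical:

```python
# Illustrative standalone version of the wake-verify predicate described
# above. WAKE mirrors stt.wake_words; variants listed here are examples.
import re

WAKE = {"sanad", "senad", "sanaad"}

def verify_wake(heard: str) -> bool:
    low = (heard or "").lower().strip().lstrip('"\'.,!?')
    if not low:
        return False                         # empty → cough / door slam
    if any(re.search(r'\b' + re.escape(w) + r'\b', low) for w in WAKE):
        return True                          # explicit wake-word variant
    return low.startswith(("s", "sh", "z"))  # /sa-/ phonetic signature

print(verify_wake("Sanad."))  # True  — wake word
print(verify_wake("Stop"))    # True  — s-starting lookalike
print(verify_wake("hello"))   # False — confidently not-s speech
print(verify_wake(""))        # False — pure noise
```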
+ heard = self._transcribe_raw(burst) + verify_ms = (time.time() - t_verify) * 1000 + low = (heard or "").lower().strip().lstrip('"\'.,!?') + if not low: + log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms) + return + starts_with_s = low.startswith(("s", "sh", "z")) + if _has_wake_word(heard): + log.info(" wake verified (wake-word: %r, %.0fms)", + heard, verify_ms) + elif starts_with_s: + log.info(" wake verified (s-phonetic: %r, %.0fms)", + heard, verify_ms) + else: + log.info(" wake REJECTED — %r (%.0fms, not s-starting)", + heard, verify_ms) + return + + print("\n [Sanad] wake heard — listening…") + + ack_mode = self._stt.get("wake_ack", "tts").lower() + if ack_mode == "none": + log.info(" wake-ack: silent (no TTS)") + else: try: - self._on_command("", "en") + self._audio.speak(self._messages.get("wake_heard", "Yes")) except Exception as e: - log.error("on_command callback error: %s", e) + log.warning("TTS ack failed: %s", e) + + # Wait for ack TTS + speaker reverberation to decay + while self._audio.is_speaking: + time.sleep(0.05) + settle = float(self._stt.get("post_tts_settle_sec", 0.3)) + time.sleep(settle) + self._mic_capture.flush() + log.info(" wake→record-ready: %.2fs", time.time() - t_wake) + + log.info("Recording command...") + audio = self._record_command() + # _record_command returns empty if it never saw speech above the + # adaptive entry threshold — no point running STT on noise. + # Two cases: + # audio.size == 0 → no speech at all (likely false wake + # from cough/slam). SILENTLY reset — + # don't blurt "I didn't catch that" on + # what was never a real interaction. + # 0 < size < 8000 → brief speech burst (< 0.5s). Probably + # a real-but-unintelligible attempt; + # speak "I didn't catch that" so the + # user knows to retry. + if audio.size == 0: + log.info("Command dropped (no speech — likely false wake); silent reset") + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + if audio.size < 8000: # < 0.5 s but > 0 — real short attempt + log.info("Command too short (%.2fs); asking user to repeat", + audio.size / self._sample_rate) + try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) + except Exception: pass + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + + peak = int(np.abs(audio).max()) + rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) + log.info("command audio: samples=%d peak=%d rms=%.1f", + audio.size, peak, rms) + + text = self._transcribe_command(audio) + if not text: + log.info("Empty or rejected transcription") + # Save WAV of the failed transcription for post-mortem. + if self._stt.get("recording_enabled", True): + self._save_turn_wav(audio, transcription="", tag="unk") + try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) + except Exception: pass + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + + # Normalize near-miss transcriptions like "Turn right up" → "turn + # right" so the brain's regex fast-path catches them. + text = self._normalize_command(text) + log.info("Transcribed: %s", text[:120]) + + # Save every successful command recording so you can listen back + # later and see what the mic actually heard vs what Whisper + # transcribed. Disable with stt.recording_enabled=false. 
+ if self._stt.get("recording_enabled", True): + wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd") + if wav_path: + log.info("saved: %s", os.path.basename(wav_path)) + + if self._on_command: + try: + self._on_command(text, "en") + except Exception as e: + log.error("on_command error: %s", e, exc_info=True) + elif self._on_wake: + try: self._on_wake() + except Exception: pass + + cd = float(self._stt.get("command_cooldown_sec", 1.5)) + self._cooldown_until = time.time() + cd + log.info("wake→dispatch total: %.2fs | cooldown %.1fs", + time.time() - t_wake, cd) # ─── start / stop ───────────────────────────────────── @@ -166,18 +1143,14 @@ class VoiceModule: log.warning("VoiceModule already running") return self._running = True - self._thread = threading.Thread( - target=self._voice_loop, daemon=True, name="voice", - ) + self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice") self._thread.start() log.info("Voice module started") def stop(self): self._running = False - try: - self._mic_capture.stop() - except Exception: - pass + try: self._mic_capture.stop() + except Exception: pass if self._thread: self._thread.join(timeout=5) self._thread = None @@ -188,22 +1161,15 @@ class VoiceModule: return self._running -# ─── standalone test ───────────────────────────────────── if __name__ == "__main__": from API.audio_api import AudioAPI - - def on_wake(): - print(" (brain callback fired)") - + def on_cmd(text, lang): + print(f"\n COMMAND [{lang}]: {text}\n") audio = AudioAPI() - voice = VoiceModule(audio, on_wake=on_wake) - print("Starting voice module... say any short word to test the wake.") - print("Press Ctrl-C to stop.\n") + voice = VoiceModule(audio, on_command=on_cmd) + print('Starting. Say "Sanad", then speak your command.\n') voice.start() try: - while voice.is_running: - time.sleep(0.5) + while voice.is_running: time.sleep(0.5) except KeyboardInterrupt: - print("\nStopping...") voice.stop() - print("Done.") diff --git a/Voice/wake_detector.py b/Voice/wake_detector.py index ef90796..640ad11 100644 --- a/Voice/wake_detector.py +++ b/Voice/wake_detector.py @@ -49,10 +49,14 @@ import numpy as np @dataclass class WakeConfig: sample_rate: int = 16_000 - # RMS (int16 units) above which we consider a chunk to be speech. - # G1 on-board mic at normal speaking distance has rms ≈ 500-1500 - # during speech and ≈ 40-100 in silence. 150 is a safe middle ground. - speech_threshold: float = 150.0 + # RMS (int16 units) FLOOR for "this chunk is speech". The effective + # threshold is max(speech_threshold, ambient_baseline * adaptive_mult) + # so this is only a minimum guarantee — the detector adapts upward + # in noisy rooms but never below this floor. + # G1 far-field mic at normal speaking distance has rms ~ 80-400 for + # quiet speech, 400-1500 for clear speech. 80 catches quiet speech; + # raise to 120-150 if fan/typing noise triggers false wakes. + speech_threshold: float = 80.0 # How long a burst of speech must last to count as a "word". min_word_duration_s: float = 0.20 max_word_duration_s: float = 1.50 @@ -63,6 +67,13 @@ class WakeConfig: cooldown_s: float = 1.50 # RMS window size — we analyze this many ms of audio per step. chunk_ms: int = 50 + # Adaptive: how many *recent silent* chunks to average for the noise + # floor, and the multiplier applied on top. effective_threshold = + # max(speech_threshold, baseline * adaptive_mult). 
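The adaptive rule described in that comment reduces to one line; a quick numeric sketch with the defaults above (baseline values illustrative):

```python
# Effective wake threshold — the rule from the WakeConfig comment above.
# speech_threshold and adaptive_mult come from config_Voice.json::stt.
def effective_threshold(baseline: float,
                        speech_threshold: float = 80.0,
                        adaptive_mult: float = 3.0) -> float:
    return max(speech_threshold, baseline * adaptive_mult)

print(effective_threshold(10))    # dead quiet:  80.0 — floor wins
print(effective_threshold(30))    # quiet room:  90.0 — slightly above floor
print(effective_threshold(120))   # noisy room: 360.0 — adapts upward
```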
+ adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks + adaptive_mult: float = 3.0 + # Periodic diagnostic log cadence (seconds). 0 disables. + diag_log_sec: float = 3.0 class WakeDetector: @@ -88,6 +99,24 @@ class WakeDetector: # chunks don't align with our internal analysis window). self._carry = np.zeros(0, dtype=np.int16) + # Audio of the most-recent wake-triggering burst. Saved when the + # detector fires so callers (marcus_voice) can run Whisper on it + # and verify the word was actually "Sanad" rather than a cough. + self._burst_samples: list = [] # accumulated during SPEAKING + self._last_burst_audio: Optional[np.ndarray] = None + + # Adaptive noise floor (rolling mean of RMS during SILENCE chunks). + self._baseline_buf = [] # last N silent-window RMS values + self._baseline = 0.0 # current estimate + self._peak_since_diag = 0.0 # max rms since last diag log + self._last_diag = time.time() + # Logger is optional — if the host app set up logging, use it. + try: + import logging + self._log = logging.getLogger("wake_detector") + except Exception: + self._log = None + # ── public API ──────────────────────────────────────────────── def process(self, pcm_bytes: bytes) -> bool: @@ -122,31 +151,73 @@ class WakeDetector: self._state = self.STATE_SILENCE self._silence_run = 0 self._carry = np.zeros(0, dtype=np.int16) + self._burst_samples = [] + + def get_last_burst(self) -> Optional[np.ndarray]: + """ + Return the int16 PCM samples of the most-recent wake-triggering + burst, or None if no wake has fired yet. Used by marcus_voice to + verify the triggering word was actually 'Sanad' before proceeding. + """ + return self._last_burst_audio # ── internal ────────────────────────────────────────────────── def _step(self, window: np.ndarray) -> bool: rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2))) - is_speech = rms > self.cfg.speech_threshold + # Effective threshold = max(config floor, adaptive baseline * mult) + eff = self.cfg.speech_threshold + if self._baseline > 0: + eff = max(eff, self._baseline * self.cfg.adaptive_mult) + is_speech = rms > eff + + # Track peak for diag. Log periodically so you can *see* what the + # detector is hearing — invaluable when "not hearing me" happens. + if rms > self._peak_since_diag: + self._peak_since_diag = rms now = time.time() + if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec: + if self._log is not None: + self._log.info( + "wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s", + self._peak_since_diag, self._baseline, eff, self._state, + ) + self._peak_since_diag = 0.0 + self._last_diag = now + if now < self._cooldown_until: return False # silent during cooldown if self._state == self.STATE_SILENCE: + # Learn the noise floor ONLY in silence — so speech bursts + # don't pull the baseline up and lock us out of wake. + if not is_speech: + self._baseline_buf.append(rms) + if len(self._baseline_buf) > self.cfg.adaptive_window_n: + self._baseline_buf.pop(0) + if self._baseline_buf: + self._baseline = sum(self._baseline_buf) / len(self._baseline_buf) if is_speech: self._state = self.STATE_SPEAKING self._speech_start = self._sample_cursor self._silence_run = 0 + # Begin capturing the burst audio for later Whisper verify. + self._burst_samples = [window.copy()] return False # STATE_SPEAKING + # Accumulate every window (speech OR silence inside the burst) + # so we capture the full word + trailing quiet. 
+ self._burst_samples.append(window.copy()) + if is_speech: self._silence_run = 0 # Abort if the burst is longer than a single word — user is # just talking, not addressing the robot. if self._sample_cursor - self._speech_start > self._max_speech: self._state = self.STATE_SILENCE + self._burst_samples = [] return False # Silent window inside SPEAKING — accumulate. @@ -156,6 +227,12 @@ class WakeDetector: self._state = self.STATE_SILENCE self._silence_run = 0 if self._min_speech <= speech_len <= self._max_speech: + # Snapshot burst audio for the caller's Whisper verify. + self._last_burst_audio = ( + np.concatenate(self._burst_samples) + if self._burst_samples else None + ) + self._burst_samples = [] self._cooldown_until = now + self.cfg.cooldown_s return True return False
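For completeness, a bench sketch (not part of the patch) showing how the adaptive threshold and burst capture behave on synthetic PCM, assuming the detector's defaults; amplitudes, durations, and the `feed` helper are illustrative:

```python
# Bench sketch: feed WakeDetector synthetic PCM and watch the adaptive
# baseline learn, then a 0.5 s burst + trailing silence trigger a wake.
import numpy as np
from Voice.wake_detector import WakeDetector, WakeConfig

SR = 16_000
det = WakeDetector(WakeConfig(sample_rate=SR, diag_log_sec=0))

def feed(samples: np.ndarray) -> bool:
    fired = False
    pcm = samples.astype(np.int16).tobytes()
    for off in range(0, len(pcm), 1024):      # ~32 ms chunks, like the mic
        fired |= det.process(pcm[off:off + 1024])
    return fired

rng = np.random.default_rng(0)
quiet = rng.integers(-50, 50, 3 * SR)         # ~3 s idle → baseline learns
word  = 3000 * np.sin(2 * np.pi * 300 * np.arange(int(0.5 * SR)) / SR)
tail  = rng.integers(-50, 50, SR)             # silence after the burst

assert not feed(quiet)                        # idle noise: no wake
fired = feed(word) or feed(tail)              # word + post-silence → wake
burst = det.get_last_burst()
print("wake fired:", fired,
      "| burst samples:", None if burst is None else burst.size)
```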