diff --git a/API/audio_api.py b/API/audio_api.py index 89169df..8b73f1a 100644 --- a/API/audio_api.py +++ b/API/audio_api.py @@ -211,56 +211,58 @@ class AudioAPI: # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ───────── - def _play_pcm(self, audio_16k: np.ndarray) -> float: - """Play 16kHz mono int16 on G1 speaker. Returns duration.""" + def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float: + """ + Play mono int16 PCM on the G1 speaker. + + `rate` is the sample rate of the incoming `audio`; we always + resample to self._target_rate (16 kHz) before sending because the + G1 speaker hardware only honors that rate — if you hand it 24 kHz + PCM, it plays ~1.5x too fast. This matches the Sanad pattern. + + Uses AudioClient.PlayStream (the high-level API) with a fresh + stream_id + STOP_PLAY bracket on either side so a prior stream + can't blend into this one. + """ if not self._sdk_available: log.warning("SDK not available, cannot play audio") return 0.0 - from unitree_sdk2py.g1.audio.g1_audio_api import ( - ROBOT_API_ID_AUDIO_START_PLAY, - ROBOT_API_ID_AUDIO_STOP_PLAY, - ) + src_rate = int(rate) if rate else self._target_rate + audio = self._resample(audio, src_rate) # → self._target_rate + if audio.size == 0: + return 0.0 + + from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY app_name = self._spk["app_name"] - # Stop previous stream + # Stop any prior stream before opening a new one. self._client._Call( ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}), ) - time.sleep(0.3) + time.sleep(0.15) - # Build params — unique stream_id every call - pcm = audio_16k.tobytes() sid = f"s_{int(time.time() * 1000)}" - param = json.dumps({ - "app_name": app_name, - "stream_id": sid, - "sample_rate": self._target_rate, - "channels": 1, - "bits_per_sample": 16, - }) + self._client.PlayStream(app_name, sid, audio.tobytes()) - # Single call — full buffer - self._client._CallRequestWithParamAndBin( - ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm) - ) - - duration = len(audio_16k) / self._target_rate - time.sleep(duration + 0.5) + duration = len(audio) / self._target_rate + # Margin covers DDS buffer drain before STOP cuts playback short. + time.sleep(duration + 0.3) self._client._Call( ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}), ) - log.info("Played: %.1fs", duration) + log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)", + duration, src_rate, self._target_rate) return duration - def play_pcm(self, audio_16k: np.ndarray) -> float: + def play_pcm(self, audio: np.ndarray, rate: int = None) -> float: """Public wrapper for playing PCM audio.""" - return self._play_pcm(audio_16k) + return self._play_pcm(audio, rate=rate) # ─── MIC RECORDING ─────────────────────────────────── diff --git a/API/zmq_api.py b/API/zmq_api.py index adeb91b..1b4763f 100644 --- a/API/zmq_api.py +++ b/API/zmq_api.py @@ -80,6 +80,11 @@ def send_cmd(cmd: str): _ensure_sock().send_string(json.dumps({"cmd": cmd})) -# Load MOVE_MAP from navigation config (pure data, safe at import time) +# Load navigation constants from config (pure data, safe at import time). +# MOVE_MAP[direction] = (vx, vy, vyaw). STEP_DURATION_SEC is how long one +# "step" of a bare directional command lasts (2 s at default velocities +# ≈ 60 cm forward or 34° turn). Both live in config_Navigation.json so +# motion can be retuned without editing Python. 
_nav = load_config("Navigation") -MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} +MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} +STEP_DURATION_SEC = float(_nav.get("step_duration_sec", 2.0)) diff --git a/Brain/command_parser.py b/Brain/command_parser.py index a166e28..c25a462 100644 --- a/Brain/command_parser.py +++ b/Brain/command_parser.py @@ -4,7 +4,7 @@ Handles place memory, odometry, session recall, help, examples """ import re import time -from API.zmq_api import send_vel, gradual_stop +from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC from API.memory_api import mem, place_save, place_goto, places_list_str from API.odometry_api import odom, ODOM_AVAILABLE from API.camera_api import get_frame @@ -36,6 +36,30 @@ _RE_WALK_STEP = re.compile( r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I) _RE_TURN_STEP = re.compile( r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I) + +# Simple one-shot motion — one word or verb+direction, no counts/units. +# All default to a ~2 s motion at the normal velocity. Kept local so the +# user doesn't eat a 5 s Qwen round-trip for a trivial "go back". +# +# Matches: +# "left" / "right" / "forward" / "back" / "backward" +# "go back" / "step back" / "move back" / "walk back" / "run back" +# "go forward" / "step forward" / "move forward" / "walk forward" +# "go left" / "move right" / "step left" / etc. +# "head forward" / "head back" +# Does NOT match multi-word phrases like "walk to the chair" — those +# still fall through to Qwen where they belong. +_RE_SIMPLE_DIR = re.compile( + r"^(?:(?:walk|go|move|step|run|head)\s+)?" + r"(forward|back(?:ward)?|left|right)$", + re.I, +) + +# Bare stop / pause words — no need to ask Qwen what "stop" means. +_RE_STOP_SIMPLE = re.compile( + r"^(?:stop|halt|wait|pause|stay|freeze|hold|stand\s+still|don'?t\s+move)$", + re.I, +) _RE_PATROL_RT = re.compile( r"^patrol[/:]\s*(.+)$", re.I) _RE_LAST_CMD = re.compile( @@ -115,9 +139,10 @@ def try_local_command(cmd: str) -> bool: if odom: odom.walk_distance(meters) else: + vx, _, _ = MOVE_MAP["forward"] t0 = time.time() - while time.time() - t0 < meters / 0.3: - send_vel(vx=0.3) + while time.time() - t0 < meters / abs(vx): + send_vel(vx=vx) time.sleep(0.05) gradual_stop() return True @@ -128,9 +153,10 @@ def try_local_command(cmd: str) -> bool: if odom: odom.walk_distance(meters, direction="backward") else: + vx, _, _ = MOVE_MAP["backward"] t0 = time.time() - while time.time() - t0 < meters / 0.2: - send_vel(vx=-0.2) + while time.time() - t0 < meters / abs(vx): + send_vel(vx=vx) time.sleep(0.05) gradual_stop() return True @@ -144,9 +170,13 @@ def try_local_command(cmd: str) -> bool: if odom: odom.turn_degrees(degrees) else: + # vyaw magnitude comes from MOVE_MAP["left"]; duration is + # abs(degrees)/(vyaw_deg_per_sec). vyaw in config is rad/s. 
+ _, _, vyaw_mag = MOVE_MAP["left"] + vyaw_deg_per_sec = abs(vyaw_mag) * 180.0 / 3.14159265 + vyaw = vyaw_mag if degrees > 0 else -vyaw_mag + duration = abs(degrees) / vyaw_deg_per_sec t0 = time.time() - vyaw = 0.3 if degrees > 0 else -0.3 - duration = abs(degrees) / 17.2 while time.time() - t0 < duration: send_vel(vyaw=vyaw) time.sleep(0.05) @@ -156,9 +186,11 @@ def try_local_command(cmd: str) -> bool: m = _RE_WALK_STEP.match(cmd) if m: direction = (m.group(1) or "forward").lower() + if direction.startswith("back"): + direction = "backward" steps = int(m.group(2)) - vx = -0.2 if direction.startswith("back") else 0.3 - duration = 2.0 * steps + vx, _, _ = MOVE_MAP[direction] + duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: send_vel(vx=vx) @@ -170,8 +202,8 @@ def try_local_command(cmd: str) -> bool: if m: direction = m.group(1).lower() steps = int(m.group(2)) if m.group(2) else 1 - vyaw = 0.3 if direction == "left" else -0.3 - duration = 2.0 * steps + _, _, vyaw = MOVE_MAP[direction] + duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: send_vel(vyaw=vyaw) @@ -179,6 +211,31 @@ def try_local_command(cmd: str) -> bool: gradual_stop() return True + # ── BARE / SIMPLE DIRECTIONAL COMMANDS ─────────────────────────────── + # "left", "right", "forward", "back", "go back", "move forward", + # "step right", "walk back", "head forward" — any one-word direction + # or verb+direction with no explicit count. Duration and velocities + # come entirely from config_Navigation.json (MOVE_MAP + + # step_duration_sec) — no magic numbers here. + m = _RE_SIMPLE_DIR.match(cmd) + if m: + direction = m.group(1).lower() + if direction.startswith("back"): + direction = "backward" + vx, vy, vyaw = MOVE_MAP[direction] + t0 = time.time() + while time.time() - t0 < STEP_DURATION_SEC: + send_vel(vx=vx, vy=vy, vyaw=vyaw) + time.sleep(0.05) + gradual_stop() + return True + + # ── BARE STOP / PAUSE ───────────────────────────────────────────────── + m = _RE_STOP_SIMPLE.match(cmd) + if m: + gradual_stop() + return True + # ── NAMED PATROL ROUTE ─────────────────────────────────────────────── m = _RE_PATROL_RT.match(cmd) if m: @@ -234,17 +291,11 @@ def try_local_command(cmd: str) -> bool: for phrase, (reverse_dir, _) in move_words.items(): if phrase in cl: print(f" Undoing: '{c}' → reversing with '{reverse_dir}'") - dur, t0 = 2.0, time.time() - if reverse_dir in ("left", "right"): - vyaw = 0.3 if reverse_dir == "left" else -0.3 - while time.time() - t0 < dur: - send_vel(vyaw=vyaw) - time.sleep(0.05) - else: - vx = 0.3 if reverse_dir == "forward" else -0.2 - while time.time() - t0 < dur: - send_vel(vx=vx) - time.sleep(0.05) + vx, vy, vyaw = MOVE_MAP[reverse_dir] + t0 = time.time() + while time.time() - t0 < STEP_DURATION_SEC: + send_vel(vx=vx, vy=vy, vyaw=vyaw) + time.sleep(0.05) gradual_stop() return True print(" No movement command to undo") diff --git a/Config/config_Navigation.json b/Config/config_Navigation.json index f5f6f7c..bb166eb 100644 --- a/Config/config_Navigation.json +++ b/Config/config_Navigation.json @@ -3,6 +3,8 @@ "min_steps_before_check": 3, "scan_interval_s": 0.4, "rotation_speed": 0.3, + "_step_duration_comment": "Duration of one 'step' for bare directional commands ('go back', 'turn right', etc.). 2.0s at move_map velocities ≈ 60 cm forward, 40 cm back, 34° turn. 
Change here and every regex fast-path in command_parser.py uses the new value.", + "step_duration_sec": 2.0, "move_map": { "forward": [0.3, 0.0, 0.0], "backward": [-0.2, 0.0, 0.0], diff --git a/Config/config_Voice.json b/Config/config_Voice.json index d3fa55e..0332956 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -5,14 +5,119 @@ "target_sample_rate": 16000 }, "stt": { - "backend": "custom_acoustic", - "_comment": "Pure-DSP wake detector in Voice/wake_detector.py. No ML.", - "speech_threshold": 150.0, - "min_word_duration": 0.20, - "max_word_duration": 1.50, - "post_silence": 0.30, - "wake_cooldown": 1.50, - "wake_chunk_ms": 50 + "backend": "faster_whisper", + "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", + + "_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).", + "mode": "wake_and_command", + + "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", + "always_on_speech_entry_rms": 150.0, + "always_on_silence_exit_rms": 70.0, + "always_on_silence_duration_sec": 0.8, + "always_on_min_utterance_sec": 0.3, + "always_on_max_utterance_sec": 12.0, + "always_on_idle_log_sec": 5.0, + "always_on_ambient_mult": 1.4, + "always_on_ambient_window_chunks": 100, + + + "whisper_model": "base.en", + "whisper_device": "cpu", + "whisper_compute_type": "int8", + + "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.", + "mic_gain": 1.0, + "whisper_beam_size": 8, + "whisper_no_speech_threshold": 0.85, + "whisper_log_prob_threshold": -1.8, + "whisper_compression_ratio_threshold": 3.0, + "whisper_temperature_fallback": [0.0, 0.2, 0.4], + "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", + "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. 
A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", + "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", + "whisper_initial_prompt": "Robot voice command.", + + "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", + "wake_words": [ + "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad", + "sanat", "sunnat", "sonnat", "sinnat", "sennat", + "sanid", "sanud", "saned", "sanod", "sanaad", + "senad", "sinad", "sonad", "sunad", + "sanah", "sanath", "sanadh", "sonadh", + "samad", "somad", "sumad", + "thanad", "zanad", + "sa nad", "san ad", "san odd", "san add" + ], + "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.", + "command_vocab": [ + "what do you see", "what can you see", "look around", + "come to me", "come here", "come back", "come closer", + "approach", "get closer", "come", + "go home", "go back", "go forward", "go backward", + "go left", "go right", "go", + "sit down", "stand up", "sit", "stand", + "raise arm", "lower arm", "wave hello", "wave", "point", + "turn left", "turn right", "turn around", + "move forward", "move backward", "move back", + "move left", "move right", + "walk forward", "walk backward", "walk back", + "step forward", "step back", "step left", "step right", + "forward", "backward", "back", "left", "right", + "patrol", "stop", "halt", "wait", "pause", "freeze", "hold", + "hello", "hi", "hey", "help", + "who are you", "where are you", "where am i", "what is your name", + "remember this", "forget", "do it again", "repeat", "undo", + "follow me", "stay here" + ], + "command_vocab_cutoff": 0.72, + "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.", + "garbage_patterns": [ + "thanks for watching", "thank you for watching", + "thank you", "thanks", + "bye", "goodbye", + ".", "you", "yeah", + "okay", "ok", + "um", "uh", "hmm", "mm", + "i", "a" + ], + "min_transcription_length": 3, + + + "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. 
Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.", + "speech_threshold": 400.0, + "min_word_duration": 0.25, + "max_word_duration": 2.50, + "post_silence": 0.20, + "wake_cooldown": 1.00, + "wake_chunk_ms": 50, + "wake_adaptive_window_n": 50, + "wake_adaptive_mult": 3.0, + "wake_diag_log_sec": 3.0, + + "wake_ack": "tts", + "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", + + "_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.", + "wake_verify_enabled": false, + + + "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.", + "speech_entry_rms": 400.0, + "silence_exit_rms": 200.0, + "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.", + "silence_duration_sec": 0.6, + "max_record_sec": 5.0, + "min_record_sec": 0.4, + "ambient_probe_sec": 0.2, + "ambient_mult": 1.5, + "ambient_cap_rms": 200.0, + "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.", + "recording_enabled": true, + "recording_keep_count": 50, + "command_cooldown_sec": 1.5, + "post_tts_settle_sec": 0.4, + "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'." }, "mic": { "backend": "builtin_udp", diff --git a/Config/marcus_prompts.yaml b/Config/marcus_prompts.yaml index 7a71fbf..0ff9b4f 100644 --- a/Config/marcus_prompts.yaml +++ b/Config/marcus_prompts.yaml @@ -25,6 +25,15 @@ main_prompt: | - speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English. - abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe. 
+ - CRITICAL — IF THE COMMAND IS UNCLEAR OR NOT AN ACTION: + If the input text is a single unclear word (like "I", "alright", "ok", "um"), + a random phrase ("I have a lot of beauty", "turn turn turn"), noise, a + greeting ("hello", "hi"), or anything that isn't clearly a movement / + arm / vision / memory instruction — DO NOT INVENT a command. Instead + reply with: + {{"actions":[],"arm":null,"speak":"Sorry, I didn't understand that — please repeat","abort":"command not understood"}} + Better to ask again than to guess and perform the wrong action. + Examples (learn the STRUCTURE, don't reuse the speak text): "turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}} "walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}} diff --git a/Doc/MARCUS_API.md b/Doc/MARCUS_API.md index bfbb718..9f400db 100644 --- a/Doc/MARCUS_API.md +++ b/Doc/MARCUS_API.md @@ -29,7 +29,7 @@ | Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. | | Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. | | Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. | -| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. | +| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. | | Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. | | Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. | | Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. | @@ -766,9 +766,9 @@ SAFETY: --- -## 15. Voice API (mic + TTS + STT) +## 15. Voice API (mic + TTS + wake + STT) -New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack. +Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper). 
### Mic — `Voice.builtin_mic.BuiltinMic` @@ -800,9 +800,36 @@ tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly. -### Wake + command loop — `Voice.marcus_voice.VoiceModule` +### Wake detection — `Voice.wake_detector.WakeDetector` -Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands. +Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification. + +```python +from Voice.wake_detector import WakeDetector, WakeConfig +cfg = WakeConfig( + sample_rate=16_000, + speech_threshold=400.0, # min RMS floor — above noise + min_word_duration_s=0.35, # filter out coughs (<0.35s) + max_word_duration_s=1.50, # filter out sentences + post_silence_s=0.30, # how long silence marks word end + cooldown_s=1.50, # min gap between fires + chunk_ms=50, # RMS analysis window + adaptive_window_n=50, # rolling mean of idle RMS + adaptive_mult=3.0, # effective = max(floor, baseline×mult) +) +det = WakeDetector(cfg) +while True: + pcm = mic.read_chunk(1024) + if det.process(pcm): + burst = det.get_last_burst() # audio that triggered wake + break +``` + +Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`. + +### Voice orchestrator — `Voice.marcus_voice.VoiceModule` + +Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`. ```python from API.audio_api import AudioAPI @@ -818,7 +845,9 @@ voice.start() # background thread voice.stop() ``` -Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`. +Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet. + +The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `audio_api.speak(reply)`. ### AudioAPI — `API.audio_api.AudioAPI` diff --git a/Doc/architecture.md b/Doc/architecture.md index dd12b3d..24b830e 100644 --- a/Doc/architecture.md +++ b/Doc/architecture.md @@ -15,7 +15,7 @@ - **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import. - **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic. - **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed. -- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`). +- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed. - **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages. 
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps. - **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows. @@ -66,7 +66,7 @@ Marcus/ │ ├── config_Memory.json # session/places paths │ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports │ ├── config_ImageSearch.json # search defaults -│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port +│ ├── config_Voice.json # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds │ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params │ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch) │ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.) @@ -83,10 +83,11 @@ Marcus/ │ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic │ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status() │ -├── Voice/ # Mic + TTS + wake-word STT +├── Voice/ # Mic + TTS + wake detector + faster-whisper STT │ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555 │ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id) -│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine +│ ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline +│ └── marcus_voice.py # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch │ ├── Brain/ # Decision logic — imports ONLY from API/ │ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal() @@ -188,7 +189,8 @@ Marcus/ │ goal_nav.py │ │ builtin_mic.py │ │ patrol.py │ │ builtin_tts.py │ │ marcus_odometry.py │ │ marcus_voice.py │ -│ marcus_yolo.py │ │ (Whisper + TtsMaker) │ +│ marcus_yolo.py │ │ wake_detector.py │ +│ │ │ (Whisper + TtsMaker) │ │ marcus_imgsearch.py │ └──────────┬──────────────┘ └──────────────┬───────────┘ │ │ │ @@ -487,10 +489,10 @@ Supports text-only search (no reference image) using hint description. ### Voice/ -Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable). +Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable). -#### `builtin_mic.py` (~180 lines, new 2026-04-21) -Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. +#### `builtin_mic.py` (~180 lines) +Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`. **Exports:** - `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent) @@ -499,20 +501,41 @@ Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. 
Joins the G1's on-boa - `read_seconds(s)` — convenience for "record `s` seconds" - `flush()` — drop buffered audio (called while TTS plays, to avoid echo) -#### `builtin_tts.py` (~70 lines, new 2026-04-21) -Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone). +#### `builtin_tts.py` (~70 lines) +Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input. **Exports:** - `BuiltinTTS(audio_client, default_speaker_id=0)` — init - `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker -#### `marcus_voice.py` (~340 lines, rewired 2026-04-21) -Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`. +#### `wake_detector.py` (~240 lines) +Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.2-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification. +Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns. **Exports:** -- `VoiceModule(audio_api, on_command=cb)` — init -- `start()` — spawn background thread -- `stop()` — graceful teardown +- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)` +- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word" +- `reset()`, `get_last_burst() -> np.ndarray | None` + +#### `marcus_voice.py` (~1000 lines) +Voice orchestrator. Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback. + +Three operating modes selectable via `stt.mode`: +- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain +- `always_on`: no wake, transcribe every utterance, dispatch all +- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad" + +Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently. 
+ +**Module-level** (populated at `VoiceModule.__init__` from config): +- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth +- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → "" +- `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB` + +**Exports:** +- `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init +- `start()` / `stop()` — background thread lifecycle +- `is_running` property --- diff --git a/Doc/controlling.md b/Doc/controlling.md index 20e0b16..a9492b4 100644 --- a/Doc/controlling.md +++ b/Doc/controlling.md @@ -79,16 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765` ## Voice -- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`) +- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants). - **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed. -- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally. +- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient. +- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently. +- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after. - **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only. -- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command. +- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening. -Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker. +Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker. + +Three voice modes selectable via `config_Voice.json::stt.mode`: +- `wake_and_command` (default) — wake word required before each command +- `always_on` — continuously transcribe + dispatch every utterance +- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad" To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster. 
+**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`: +- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration` +- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed +- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold` +- "I didn't catch that" on silence → raise `min_transcription_length` +- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle) + --- ## Command Reference @@ -230,7 +244,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: | `llama runner process has terminated: %!w()` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` | | Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only | | `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast | -| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` | +| Wake word never fires | Energy burst below floor, or Whisper verify rejecting | Check `logs/voice.log` — if you see `wake REJECTED — 'X'`, add X's root variant to `config_Voice.json::stt.wake_words`. If `baseline=0` persists, your ambient exceeds the floor — raise `speech_threshold`. | | Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" | | `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` | | Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up | @@ -243,7 +257,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: |------|------| | Brain code | `~/Marcus/Brain/` | | Server | `~/Marcus/Server/marcus_server.py` | -| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` | +| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,wake_detector,marcus_voice}.py` | | Config | `~/Marcus/Config/` | | Prompts | `~/Marcus/Config/marcus_prompts.yaml` | | YOLO model | `~/Marcus/Models/yolov8m.pt` | @@ -261,7 +275,7 @@ See `Doc/functions.md` for the full function inventory (AST-generated). ## Language policy **English only.** Arabic was removed from the codebase on 2026-04-21: -- `Config/config_Voice.json::stt.wake_words_en` — only English variants (`sanad`, `sannad`, `sanat`, `sunnat`) +- `Config/config_Voice.json::stt.wake_words` — English fuzzy variants only (33 entries), excludes common English words that would false-trigger (`said`, `sand`, `sunday`, etc.) 
- `Config/marcus_prompts.yaml` — no Arabic examples left in any of the 7 prompts - `API/audio_api.py::speak(text)` — rejects non-ASCII (the G1 TtsMaker silently maps Arabic to Chinese, which nobody wants) - `Brain/marcus_brain.py` — greeting and talk-pattern regexes match English only diff --git a/Doc/environment.md b/Doc/environment.md index 23c624a..94f3817 100644 --- a/Doc/environment.md +++ b/Doc/environment.md @@ -384,3 +384,5 @@ Config file (`Config/config_Vision.json`): | 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). | | 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. | | 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. | +| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake**. Added `Voice/wake_detector.py` (pure-numpy energy state machine, adaptive noise floor, burst-audio capture for verify). Rewrote `Voice/marcus_voice.py` around it: three operating modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, pre-speech silence trim (300 ms pre-roll preserved), faster-whisper `base.en` int8 CPU decode, fuzzy-match canonicalisation against `command_vocab`, `GARBAGE_PATTERNS` + length filter for noise hallucinations, `/s-/` phonetic wake verify (accepts Whisper mishearings of "Sanad" like "Stop"/"Set"/"Sand"). Tried and reverted: Gemini Live WebSocket (Python 3.8 incompatibility + latency), Vosk grammar STT (English lexicon can't decode "Sanad"; big model cold-load too slow on Jetson). All voice tunables (33 wake_words, 68 command_vocab, 17 garbage_patterns, ~25 threshold/VAD/Whisper keys) live in `config_Voice.json::stt.*` — zero hardcoded strings in Voice/. | +| 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. | diff --git a/Doc/functions.md b/Doc/functions.md index 3ada97d..1aaea92 100644 --- a/Doc/functions.md +++ b/Doc/functions.md @@ -49,13 +49,14 @@ Script only. 
Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra --- -## `Voice/` — mic + TTS + STT +## `Voice/` — mic + TTS + wake + STT | File | Public API | |---|---| | `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** | | `builtin_tts.py` | **class `BuiltinTTS`** | -| `marcus_voice.py` | **class `State`** (IDLE/WAKE_HEARD/PROCESSING/SPEAKING), **class `VoiceModule`** | +| `wake_detector.py` | **dataclass `WakeConfig`** + **class `WakeDetector`** | +| `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** | **`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic: `__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`. @@ -63,8 +64,11 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra **`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`: `__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`. -**`Voice.marcus_voice.VoiceModule`** — Whisper wake + command STT: -`__init__(audio_api, on_command)`, `start()`, `stop()`, props `state`, `is_running`. Internal state machine: `_do_idle`, `_do_wake_heard`, `_do_processing`; helpers `_load_whisper`, `_transcribe`, `_check_wake_word`, `_record_chunk`, `_record_until_silence`, `_voice_loop`. +**`Voice.wake_detector.WakeDetector`** — pure-numpy energy wake: +`__init__(cfg: WakeConfig)`, `process(pcm_bytes) -> bool`, `reset()`, `get_last_burst() -> np.ndarray | None`. Internal: `_step(window)` state-machine per 50 ms analysis window; adaptive `_baseline_buf` rolling mean of idle-silence RMS; captures triggering burst audio for post-hoc Whisper verify. + +**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Drives the wake detector, verifies each fire with a lightweight Whisper decode (wake-word substring OR /s-/ phonetic match), records commands with a hysteretic VAD, trims pre-speech silence, transcribes via faster-whisper, fuzzy-normalises near-misses to canonical commands, dispatches to brain. +`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `is_running` property. Internal: `_get_fw()` lazy faster-whisper loader, `_read_mic_raw` / `_read_mic_gained`, `_record_command()` with adaptive VAD + pre-silence trim, `_transcribe(audio)` Whisper decode + garbage filter, `_transcribe_command(audio)` thin wrapper, `_normalize_command(text)` fuzzy-match to `COMMAND_VOCAB`, `_handle_wake()` / `_voice_loop()` / `_voice_loop_wake()` / `_voice_loop_always_on(gated)`, `_save_unk_wav(audio)` for post-mortem debugging. 
--- diff --git a/Doc/pipeline.md b/Doc/pipeline.md index 2292911..a7d50a8 100644 --- a/Doc/pipeline.md +++ b/Doc/pipeline.md @@ -51,22 +51,43 @@ G1 body mic (array) Voice/builtin_mic.py::BuiltinMic ring buffer (64 KB) + read_chunk(n) ▼ -Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING) - ├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…) - ├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker - ├─ PROCESSING : record-until-silence → Whisper small → transcribed text - └─ on_command(text, "en") +Voice/wake_detector.py::WakeDetector + pure-numpy energy state machine (SILENCE ⇄ SPEAKING) + adaptive noise floor: eff_threshold = max(speech_threshold, baseline × 3) + fires on 0.35-1.5 s bursts followed by 0.3 s silence → captures burst audio + ▼ +Voice/marcus_voice.py::VoiceModule._handle_wake() + ├─ 1. Whisper verify on the burst audio: + │ text = faster-whisper(burst) + │ accept if _has_wake_word(text) OR startswith(s/sh/z) + │ reject otherwise (cough, clap, hello, okay) → silent return + ├─ 2. audio_api.speak("Yes") → G1 body speaker (~1.5 s) + ├─ 3. post_tts_settle_sec wait + mic flush + ├─ 4. _record_command() — hysteretic VAD + │ speech_entry_rms / silence_exit_rms (adapt from wake baseline) + │ trim leading silence (keep 300 ms pre-roll) → tight clip for Whisper + ├─ 5. _transcribe(audio) + │ faster-whisper (base.en int8 CPU) + │ beam_size=5, temperature=0, initial_prompt bias toward Sanad vocab + │ GARBAGE_PATTERNS + min_transcription_length reject noise hallucinations + ├─ 6. _normalize_command(text) + │ difflib fuzzy-match vs stt.command_vocab + │ "Turn right up" → "turn right" (canonical form) + └─ 7. on_command(text, "en") ▼ Brain/marcus_brain.py::process_command(text) ├─ regex fast-path → Brain/command_parser.py::try_local_command() - │ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off + │ places · odometry walk/turn · patrol · session recall · goal_nav + │ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt") + │ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off + │ (~50 ms when matched — NO LLM call) └─ else → _handle_llava(text) ├─ get_frame() (10×50 ms poll, no 1 s stall) ├─ API/llava_api.py::ask(text, img) │ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120) │ → parse_json() → {actions, arm, speak, abort} └─ Brain/executor.py::execute(d) - ├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma + ├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma ├─ arm → API/arm_api.py (stub for now) └─ abort → gradual_stop() ▼ @@ -77,9 +98,17 @@ API/audio_api.py::speak(text, lang="en") ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text) │ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only │ time.sleep(len(text) * 0.08) - └─ unmute mic → back to IDLE + └─ unmute mic → back to listening ``` +**Config knobs** (all in `config_Voice.json::stt`): +- Wake: `speech_threshold` (floor), `min_word_duration`, `max_word_duration`, `post_silence`, `wake_cooldown`, `wake_adaptive_mult`, `wake_diag_log_sec` +- Verify: `wake_verify_enabled` +- Record: `speech_entry_rms`, `silence_exit_rms`, `silence_duration_sec`, `max_record_sec`, `min_record_sec`, `ambient_mult`, `ambient_cap_rms` +- Whisper: `whisper_model`, `whisper_compute_type`, `whisper_beam_size`, `whisper_no_speech_threshold`, `whisper_log_prob_threshold`, `whisper_initial_prompt`, `mic_gain` +- Vocab: `wake_words`, `command_vocab`, `garbage_patterns`, 
`command_vocab_cutoff`, `min_transcription_length` +- Mode: `mode` (`wake_and_command` | `always_on` | `always_on_gated`), `wake_ack` (`tts`|`none`) + --- ## Terminal / WebSocket command pipeline (same brain, skips voice) @@ -169,7 +198,12 @@ Brain/command_parser.py — responds to "lidar status" queries | `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast | | `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) | | `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) | -| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) | +| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" for the wake-verify substring match | +| `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) | +| `stt.garbage_patterns` | config_Voice.json | 17 Whisper noise-hallucinations to reject (`"thanks for watching"`, `"okay"`, etc.) | +| `stt.speech_threshold` etc. | config_Voice.json | energy wake detector thresholds — see `Doc/controlling.md` "Voice" for the full tuning matrix | +| `stt.whisper_*` | config_Voice.json | faster-whisper model, compute type, beam size, confidence gates, bias prompt | +| `stt.mode` | config_Voice.json | `wake_and_command` (default) / `always_on` / `always_on_gated` | | `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff | | `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) | | `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) | @@ -181,9 +215,14 @@ Brain/command_parser.py — responds to "lidar status" queries | Step | Typical | Notes | |---|---|---| -| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk | -| Record until silence | 1–8 s | depends on user speech | -| Whisper small STT | 500–1500 ms | once per command | +| Wake-word detect | <100 ms | pure-numpy energy detector, 50 ms analysis windows | +| Wake verify (first wake) | ~2000 ms | includes faster-whisper `base.en` cold load | +| Wake verify (subsequent) | 300–700 ms | Whisper cached, decodes ~0.5-1.5 s burst | +| "Yes" TTS ack | ~1500 ms | G1 firmware `TtsMaker` minimum | +| Record until silence | 1–5 s | depends on user speech; `max_record_sec=5` cap | +| Pre-silence trim | <1 ms | numpy slice | +| faster-whisper STT | 500–1500 ms | `base.en` int8 on CPU, beam_size=5 | +| Fuzzy-match canonicalisation | <1 ms | difflib against 68 phrases | | Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall | | Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` | | Executor + ZMQ send | <10 ms | fire-and-forget PUB | diff --git a/Models/vosk-model-small-en-us-0.15.zip b/Models/vosk-model-small-en-us-0.15.zip deleted file mode 100644 index 0c94ec8..0000000 Binary files a/Models/vosk-model-small-en-us-0.15.zip and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/README b/Models/vosk-model-small-en-us-0.15/README deleted file mode 100644 index a7f7931..0000000 --- a/Models/vosk-model-small-en-us-0.15/README +++ /dev/null @@ -1,9 +0,0 @@ -US English model for mobile Vosk applications - -Copyright 2020 Alpha Cephei Inc - 
-Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean) -Speed: 0.11xRT (desktop) -Latency: 0.15s (right context) - - diff --git a/Models/vosk-model-small-en-us-0.15/am/final.mdl b/Models/vosk-model-small-en-us-0.15/am/final.mdl deleted file mode 100644 index 5596b31..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/am/final.mdl and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf deleted file mode 100644 index eaa40c5..0000000 --- a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf +++ /dev/null @@ -1,7 +0,0 @@ ---sample-frequency=16000 ---use-energy=false ---num-mel-bins=40 ---num-ceps=40 ---low-freq=20 ---high-freq=7600 ---allow-downsample=true diff --git a/Models/vosk-model-small-en-us-0.15/conf/model.conf b/Models/vosk-model-small-en-us-0.15/conf/model.conf deleted file mode 100644 index 9d5b0da..0000000 --- a/Models/vosk-model-small-en-us-0.15/conf/model.conf +++ /dev/null @@ -1,10 +0,0 @@ ---min-active=200 ---max-active=3000 ---beam=10.0 ---lattice-beam=2.0 ---acoustic-scale=1.0 ---frame-subsampling-factor=3 ---endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 ---endpoint.rule2.min-trailing-silence=0.5 ---endpoint.rule3.min-trailing-silence=0.75 ---endpoint.rule4.min-trailing-silence=1.0 diff --git a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst deleted file mode 100644 index 1f292e6..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst deleted file mode 100644 index 9797b26..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int deleted file mode 100644 index 762fd5f..0000000 --- a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int +++ /dev/null @@ -1,17 +0,0 @@ -10015 -10016 -10017 -10018 -10019 -10020 -10021 -10022 -10023 -10024 -10025 -10026 -10027 -10028 -10029 -10030 -10031 diff --git a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int deleted file mode 100644 index df23fd7..0000000 --- a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int +++ /dev/null @@ -1,166 +0,0 @@ -1 nonword -2 begin -3 end -4 internal -5 singleton -6 nonword -7 begin -8 end -9 internal -10 singleton -11 begin -12 end -13 internal -14 singleton -15 begin -16 end -17 internal -18 singleton -19 begin -20 end -21 internal -22 singleton -23 begin -24 end -25 internal -26 singleton -27 begin -28 end -29 internal -30 singleton -31 begin -32 end -33 internal -34 singleton -35 begin -36 end -37 internal -38 singleton -39 begin -40 end -41 internal -42 singleton -43 begin -44 end -45 internal -46 singleton -47 begin -48 end -49 internal -50 singleton -51 begin -52 end -53 internal -54 singleton -55 begin -56 end -57 internal -58 singleton -59 begin -60 end -61 internal -62 singleton -63 begin -64 end -65 internal -66 singleton -67 begin -68 end -69 internal -70 singleton -71 begin -72 end -73 internal -74 singleton -75 begin -76 end -77 internal -78 singleton -79 begin -80 end -81 internal -82 singleton -83 begin -84 end -85 internal -86 singleton -87 begin -88 end -89 internal -90 singleton -91 begin -92 end -93 internal -94 singleton -95 begin -96 
end -97 internal -98 singleton -99 begin -100 end -101 internal -102 singleton -103 begin -104 end -105 internal -106 singleton -107 begin -108 end -109 internal -110 singleton -111 begin -112 end -113 internal -114 singleton -115 begin -116 end -117 internal -118 singleton -119 begin -120 end -121 internal -122 singleton -123 begin -124 end -125 internal -126 singleton -127 begin -128 end -129 internal -130 singleton -131 begin -132 end -133 internal -134 singleton -135 begin -136 end -137 internal -138 singleton -139 begin -140 end -141 internal -142 singleton -143 begin -144 end -145 internal -146 singleton -147 begin -148 end -149 internal -150 singleton -151 begin -152 end -153 internal -154 singleton -155 begin -156 end -157 internal -158 singleton -159 begin -160 end -161 internal -162 singleton -163 begin -164 end -165 internal -166 singleton diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm deleted file mode 100644 index db789eb..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.ie b/Models/vosk-model-small-en-us-0.15/ivector/final.ie deleted file mode 100644 index 93737bf..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.ie and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.mat b/Models/vosk-model-small-en-us-0.15/ivector/final.mat deleted file mode 100644 index c3ec635..0000000 Binary files a/Models/vosk-model-small-en-us-0.15/ivector/final.mat and /dev/null differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats deleted file mode 100644 index b9d92ef..0000000 --- a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats +++ /dev/null @@ -1,3 +0,0 @@ - [ - 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 - 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] diff --git a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf deleted file mode 100644 index 7748a4a..0000000 --- a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf +++ /dev/null @@ -1 +0,0 @@ -# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf deleted file mode 100644 index 960cd2e..0000000 --- 
a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf +++ /dev/null @@ -1,2 +0,0 @@ ---left-context=3 ---right-context=3 diff --git a/README.md b/README.md index 85161c7..d96d99f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ a Python brain. | **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU | | **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU | | **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU | -| **Ears** (hear) | Always-on wake-word + command transcription | **Whisper tiny** (wake) + **Whisper small** (STT) | Jetson CPU/GPU | +| **Ears** (hear) | Energy-based wake detector + command transcription | **Custom DSP wake** (numpy, no ML) + **faster-whisper base.en int8** (STT) | Jetson CPU | | **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker | | **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU | | **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) | @@ -54,7 +54,7 @@ Camera ─┘ ▼ ├─► Legs (Holosoma Three input modalities, same command loop: -- **Voice** — say "**Sanad, what do you see?**" → wake word fires, Whisper transcribes, brain answers through the G1 speaker. +- **Voice** — say "**Sanad**" → energy detector fires, Whisper verifies the /sa-/ phoneme signature, robot replies "Yes" → speak your command → faster-whisper transcribes → brain answers through the G1 speaker. - **Text** — type the same command into `run_marcus.py`'s terminal. - **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation. @@ -84,7 +84,8 @@ There are two schools for combining them: | Vision — open-ended scene understanding | same VLM | learned | | Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned | | Arms / gestures | SDK action-ID lookup | **hand-coded** | -| Wake-word + STT | Whisper | learned | +| Wake word | Custom energy-envelope DSP (numpy) | hand-coded | +| STT (command) | faster-whisper base.en | learned | | TTS | Unitree `TtsMaker` (on-robot DSP) | firmware | | Glue between layers | Python + ZMQ + JSON | hand-coded | @@ -143,13 +144,13 @@ Same hardware, different prompts + wake word. - **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."* - **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour. -- Wake word changed in `config_Voice.json::stt.wake_words_en`. +- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles Whisper mishearings of "Sanad"). - Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location. - YOLO classes trimmed to people-only if the venue doesn't need object safety. **What you change to switch use cases:** 1. `Config/marcus_prompts.yaml` — persona + task descriptions -2. `Config/config_Voice.json::stt.wake_words_en` — the name people call the robot +2. `Config/config_Voice.json::stt.wake_words` — the name (+ fuzzy variants) people call the robot 3. `Config/config_Vision.json::tracked_classes` — relevant object set 4. 
`Config/config_Brain.json::subsystems.{lidar,voice,imgsearch,autonomous}` — enable what you need 5. Data under `Data/History/Places/places.json` — learned locations @@ -174,6 +175,7 @@ No code changes required for either deployment. Vision/ Navigation/ Voice/ Lidar/ YOLO, imgsearch goal_nav, builtin_mic, SLAM engine patrol, odom builtin_tts, (subprocess) + wake_detector, marcus_voice │ ▼ @@ -253,7 +255,7 @@ Marcus/ ├── Brain/ orchestrator, parser, executor, memory ├── Vision/ YOLO + image-guided search ├── Navigation/ goal nav, patrol, odometry -├── Voice/ built-in mic, built-in TTS, Whisper loop +├── Voice/ built-in mic, TTS, energy wake detector, faster-whisper STT ├── Autonomous/ exploration state machine ├── Lidar/ SLAM engine (subprocess) ├── Server/ WebSocket interface diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index ea8bd13..0dbbb87 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -1,28 +1,29 @@ #!/usr/bin/env python3 """ -Voice/marcus_voice.py — Marcus Wake-Signal Module (no ML, no STT). +Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT). -This is a deliberately-minimal voice subsystem: +Pipeline: + G1 mic ─► custom wake detector (numpy, offline, instant) + │ + ▼ + TTS "Yes" (AudioAPI → G1 TtsMaker) + │ + ▼ + record command audio until silence + │ + ▼ + faster-whisper base.en int8 (CPU) ──► brain callback(text) - - A custom energy-based wake detector (Voice/wake_detector.py) listens - to the G1's on-board mic continuously. - - When the user says any short word (~0.2-1.5 s of speech followed by - silence), wake fires. - - The robot acknowledges via TTS ("Yes" — configurable). - - The user then types their command at the Marcus terminal prompt. +Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no +ML). STT runs only on the recorded command, not on every 2 s of mic input, +so the CPU cost is bounded by how often the user talks. -No Vosk, no Whisper, no torch, no network. Pure numpy DSP. - -Why not STT here: - Both Vosk's small English model ("sanad" absent from lexicon) and - openai-whisper ("!!!!!" numerical garbage on this Jetson's torch-aarch64) - proved unreliable for this hardware. Rather than fight either, the - wake path becomes a simple "did the user say something?" signal. - -Interface with Marcus brain: - VoiceModule(audio_api, on_wake=callback) - on_wake() is called when wake fires. Brain can display a prompt - or do anything else. +Why faster-whisper (CTranslate2) instead of openai-whisper: + The Jetson's torch-aarch64 build has a Categorical sampler bug that + produces NaN logits on low-SNR input, which is exactly what the G1 + far-field mic captures. faster-whisper bypasses torch entirely and + runs the int8-quantized model through CTranslate2 — same quality as + Whisper base, no numerical instability, 3× faster on this hardware. """ from __future__ import annotations @@ -35,7 +36,8 @@ import time from logging.handlers import RotatingFileHandler from typing import Callable, Optional -# ─── PATH + CONFIG ─────────────────────────────────────── +import numpy as np + _PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _PROJECT_DIR not in sys.path: sys.path.insert(0, _PROJECT_DIR) @@ -58,42 +60,176 @@ logging.basicConfig( log = logging.getLogger("marcus_voice") -class VoiceModule: - """Wake-only voice subsystem — fires a callback when speech is detected.""" +# Module-level vocabulary containers. 
EMPTY on import — populated by
+# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
+# command_vocab, garbage_patterns}. Config is the single source of truth;
+# there are no hardcoded string lists here anymore.
+#
+# If you import this module without constructing a VoiceModule() first,
+# these stay empty → fuzzy-match is a no-op, wake detection rejects
+# everything, and the garbage filter rejects nothing. That's by design:
+# bad config = obviously broken behavior, not silently-drifting
+# hardcoded defaults.
+WAKE_WORDS: set = set()
+COMMAND_VOCAB: list = []
+GARBAGE_PATTERNS: set = set()
+_MIN_TRANSCRIPTION_LENGTH: int = 3

-    def __init__(self, audio_api, on_command: Optional[Callable] = None,
-                 on_wake: Optional[Callable] = None):
-        """
-        Args:
-            audio_api:  AudioAPI instance (for TTS ack).
-            on_command: kept for API compatibility; always called with
-                        text="" because there's no STT. Brain should
-                        prompt the user to type.
-            on_wake:    alternative callback fired when wake detected.
-                        Exactly one of on_command / on_wake is used.
-        """
+
+def _has_wake_word(text: str) -> bool:
+    """
+    True if the utterance contains any wake-word variant as a *whole word*
+    (word-boundary match, not substring — so "standard" doesn't trigger
+    off "sand").
+    """
+    import re
+    low = text.lower()
+    for w in WAKE_WORDS:
+        if re.search(r'\b' + re.escape(w) + r'\b', low):
+            return True
+    return False
+
+
+def _strip_wake_word_once(text: str) -> str:
+    """Single pass of wake-word stripping. Use via _strip_wake_word()."""
+    import re
+    stripped = text.strip()
+
+    # Case 1: the entire utterance is just a wake word + optional
+    # trailing punctuation. Return empty string so caller can ack-only.
+    for w in WAKE_WORDS:
+        if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
+            return ""
+
+    # Case 2: "Sanad <command>" — require whitespace (or comma + ws)
+    # between the wake word and the command so "Sanad." doesn't swallow
+    # "." as a command.
+    for w in sorted(WAKE_WORDS, key=len, reverse=True):
+        m = re.match(
+            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
+            text, re.IGNORECASE,
+        )
+        if m:
+            return m.group(1).strip(' ,.!?')
+
+        # Case 3: "<command> Sanad" — trailing wake word.
+        m = re.match(
+            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
+            text, re.IGNORECASE,
+        )
+        if m:
+            return m.group(1).strip(' ,.!?')
+
+    return text
+
+
+def _strip_wake_word(text: str) -> str:
+    """
+    Remove the wake word from the start or end of text, iteratively,
+    so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse
+    to the actual command (or empty string if nothing else was said).
+
+    Examples:
+        "Sanad, turn left"  → "turn left"
+        "Sanad turn left"   → "turn left"
+        "turn left Sanad"   → "turn left"
+        "Sanad."            → ""
+        "Sanad"             → ""
+        "Sanad. Sanad."     → ""      (was leaving "Sanad" before)
+        "Sanad Sanad stop"  → "stop"  (iterative strip)
+    """
+    # Iterate until stable — each pass peels off one wake word. Cap the
+    # iterations so malicious/garbled input can't loop forever.
+    for _ in range(5):
+        stripped = _strip_wake_word_once(text)
+        if stripped == text:
+            return text
+        text = stripped
+    return text
+
+
+def _closest_command(text: str, cutoff: float = 0.72) -> str:
+    """
+    Map a Whisper transcription to the closest known command phrase.
+
+    Returns the canonical command if there's a close-enough match, else
+    returns the original text unchanged.
Close = difflib SequenceMatcher + ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while + accepting common Whisper near-misses like "Turn right up"→"turn right" + or "What do you see?"→"what do you see"). + + Also handles the "transcription contains a command" case — if the + text has a command phrase as a substring (e.g. "Sanad, turn left" + from an echo), extract the command. + """ + from difflib import SequenceMatcher + low = text.lower().strip().rstrip(".!?,") + if not low: + return text + + # Cheap substring win first — no fuzzy needed if the command is + # literally in the transcription. + for cmd in COMMAND_VOCAB: + if cmd in low: + return cmd + + best_cmd = None + best_ratio = 0.0 + for cmd in COMMAND_VOCAB: + r = SequenceMatcher(None, low, cmd).ratio() + if r > best_ratio: + best_ratio = r + best_cmd = cmd + + if best_ratio >= cutoff: + return best_cmd + return text + + +class VoiceModule: + def __init__( + self, + audio_api, + on_command: Optional[Callable] = None, + on_wake: Optional[Callable] = None, + ): self._audio = audio_api self._on_command = on_command self._on_wake = on_wake - self._config = load_config("Voice") + self._config = load_config("Voice") self._stt = self._config.get("stt", {}) self._messages = self._config.get("messages", {}) - # Wake-detector parameters (tweakable via config_Voice.json::stt). + # Load all voice vocabulary from config — these are the only + # string lists the voice layer uses, and they come from + # config_Voice.json. If a key is missing, the list is empty and + # that feature silently degrades (fuzzy-match no-op, nothing + # rejected as garbage, no wake-word match) — NEVER crashes. + global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH + WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])} + COMMAND_VOCAB = list(self._stt.get("command_vocab", [])) + GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])} + _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3)) + self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72)) + log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", + len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS)) + + # ── Custom wake detector ── from Voice.wake_detector import WakeDetector, WakeConfig wcfg = WakeConfig( - sample_rate = 16_000, - speech_threshold = float(self._stt.get("speech_threshold", 150.0)), - min_word_duration_s= float(self._stt.get("min_word_duration", 0.20)), - max_word_duration_s= float(self._stt.get("max_word_duration", 1.50)), - post_silence_s = float(self._stt.get("post_silence", 0.30)), - cooldown_s = float(self._stt.get("wake_cooldown", 1.50)), - chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), + sample_rate = 16_000, + speech_threshold = float(self._stt.get("speech_threshold", 80.0)), + min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)), + max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)), + post_silence_s = float(self._stt.get("post_silence", 0.30)), + cooldown_s = float(self._stt.get("wake_cooldown", 1.50)), + chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), + adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)), + adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)), + diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)), ) self._detector = WakeDetector(wcfg) - # G1 built-in mic (UDP multicast). 
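The cutoff logic in `_closest_command` above is easy to sanity-check in isolation. A minimal standalone sketch of the same difflib rule — the vocabulary and test phrases here are illustrative, not the shipped `stt.command_vocab`:

```python
# Standalone sketch of the _closest_command matching rule (illustrative
# vocabulary — the real list comes from config_Voice.json::stt.command_vocab).
from difflib import SequenceMatcher

VOCAB = ["turn left", "turn right", "move forward", "stop", "what do you see"]

def closest(text: str, cutoff: float = 0.72) -> str:
    low = text.lower().strip().rstrip(".!?,")
    # Cheap substring win first, then best SequenceMatcher ratio.
    for cmd in VOCAB:
        if cmd in low:
            return cmd
    best = max(VOCAB, key=lambda cmd: SequenceMatcher(None, low, cmd).ratio())
    if SequenceMatcher(None, low, best).ratio() >= cutoff:
        return best
    return text

print(closest("Turn right up."))   # → "turn right"      (substring win)
print(closest("tell me a story"))  # → "tell me a story" (below cutoff, unchanged)
```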
+ # ── G1 mic ── from Voice.builtin_mic import BuiltinMic _mcfg = self._config.get("mic_udp", {}) self._mic_capture = BuiltinMic( @@ -101,63 +237,904 @@ class VoiceModule: port = _mcfg.get("port", 5555), buf_max = _mcfg.get("buffer_max_bytes", 64000), ) + self._sample_rate = self._mic_capture.sample_rate + + # ── global software mic gain ── + # Applied to every byte read from the mic, so wake detector, VAD, + # AND Whisper all see the boosted audio. One knob, uniform effect. + # G1 far-field mic benefits from 2.0-3.0 for normal speaking volume; + # above 4.0 you start clipping loud words. + self._mic_gain = float(self._stt.get("mic_gain", 1.0)) + if self._mic_gain != 1.0: + log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) + + # ── faster-whisper (lazy-init on first wake) ── + self._fw = None self._running = False self._thread = None + self._cooldown_until = 0.0 + log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") + # ─── gain-applied mic read ──────────────────────────── + + def _read_mic_raw(self, num_bytes: int) -> bytes: + """Raw mic read — no gain. Used by the wake detector whose + thresholds are calibrated against unamplified G1 ambient.""" + return self._mic_capture.read_chunk(num_bytes) + + def _read_mic_gained(self, num_bytes: int) -> bytes: + """ + Mic read with self._mic_gain applied. Used during command + recording so Whisper sees a louder, cleaner signal. NOT used + in the wake loop — amplifying ambient there pushes it over + the wake threshold and the detector can never find its + silent baseline. + """ + raw = self._mic_capture.read_chunk(num_bytes) + if not raw or self._mic_gain == 1.0: + return raw + arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain + return np.clip(arr, -32768, 32767).astype(np.int16).tobytes() + + # ─── lazy faster-whisper init ───────────────────────── + + def _get_fw(self): + """Load faster-whisper on first use — startup saved for cold path.""" + if self._fw is not None: + return self._fw + model = self._stt.get("whisper_model", "base.en") + device = self._stt.get("whisper_device", "cpu") + compute = self._stt.get("whisper_compute_type", "int8") log.info( - "VoiceModule initialized (custom wake detector, " - "speech_threshold=%s, min/max_word=%s/%s s)", - wcfg.speech_threshold, wcfg.min_word_duration_s, wcfg.max_word_duration_s, + "Loading faster-whisper: model=%s device=%s compute=%s", + model, device, compute, ) + try: + from faster_whisper import WhisperModel + self._fw = WhisperModel(model, device=device, compute_type=compute) + log.info("faster-whisper ready") + except Exception as e: + log.error("faster-whisper init failed: %s — voice will be wake-only", e) + self._fw = None + return self._fw + + # ─── command recording ──────────────────────────────── + + def _record_command(self) -> np.ndarray: + """ + Record the user's command with a hysteretic, adaptive-baseline VAD. + + Design (handles quiet, normal, and loud voices on the G1 mic): + + 1. Sample 200 ms of ambient noise first to learn the floor, + then set the "silence" gate to max(ambient * 2.5, floor). + Eliminates the "my silence threshold is higher than my + user's speaking level" failure mode. + + 2. Two thresholds with hysteresis: + speech_entry — RMS required to count as "speech started" + silence_exit — RMS below which we count silence + (< speech_entry; prevents mid-word bail on + breaths and short consonant gaps). + + 3. Recording can only *end* after we've actually heard speech. 
+           Pure silence just runs out to max_record_sec, then returns
+           empty (the caller plays "I didn't catch that" without
+           burning a Whisper call on noise).
+
+        4. After speech is seen, silence_budget accumulates only while
+           RMS stays below silence_exit. A single loud burst resets
+           it to zero — so natural "turn... left" pauses don't end the
+           recording.
+        """
+        # ── config knobs (all overridable via config_Voice.json::stt) ─
+        speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0))
+        silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0))
+        silence_dur      = float(self._stt.get("silence_duration_sec", 1.2))
+        max_dur          = float(self._stt.get("max_record_sec", 8.0))
+        min_dur          = float(self._stt.get("min_record_sec", 0.4))
+        # Legacy knob — unused since the record-time probe was removed
+        # (we reuse the wake detector's baseline instead; see below).
+        ambient_probe_s  = float(self._stt.get("ambient_probe_sec", 0.2))
+        ambient_mult     = float(self._stt.get("ambient_mult", 2.5))
+
+        small_chunk_bytes = 1024
+        analysis_ms       = 100
+        analysis_bytes    = int(self._sample_rate * analysis_ms / 1000) * 2
+
+        # ── 1. Reuse the wake detector's baseline instead of probing
+        # the mic right now. The wake detector's _baseline is a rolling
+        # mean of idle-silence RMS values from the last few seconds.
+        #
+        # Why NOT probe at record-time: we arrive here right after TTS
+        # "Yes", and the user typically starts speaking within 200 ms
+        # of hearing the ack. A probe window sized to the ambient floor
+        # then measures the *user's speech* as "ambient" and sets
+        # speech_entry above the user's actual amplitude — causing the
+        # "no speech in 8.00s" failure mode observed in the wild.
+        #
+        # Cap the baseline at a sensible ceiling so a one-off loud
+        # transient during idle doesn't lock us out either.
+        ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0
+        ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0))
+        ambient_rms = min(ambient_rms, ambient_cap)
+
+        if ambient_rms > 0:
+            adaptive_exit  = max(silence_exit_rms, ambient_rms * ambient_mult)
+            adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8)
+        else:
+            adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms
+
+        log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) "
+                 "speech_entry=%.0f silence_exit=%.0f",
+                 ambient_rms, ambient_cap, adaptive_entry, adaptive_exit)
+
+        # ── 2. main capture loop ──────────────────────────────────────
+        # The loop also records speech_start_byte (declared just below) —
+        # the byte offset into `collected` at which speech first crossed
+        # adaptive_entry. We trim pre-speech silence to that point (minus
+        # ~300 ms pre-roll) before returning. Keeping Whisper's input
+        # tight (speech + small tails) improves transcription accuracy by
+        # removing the ambient/HVAC portion that dilutes the mel features.
+        collected       = bytearray()   # no record-time probe any more — starts empty
+        analysis_buf    = bytearray()
+        silence_budget  = 0.0
+        total_time      = 0.0
+        speech_seen     = False
+        peak_rms_seen   = 0.0
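Since the gate arithmetic above is where the field failures have come from, here is the same derivation as a standalone, printable sketch. The helper name and the scenario numbers are illustrative, not part of the module; the defaults mirror the config keys read above:

```python
# Worked example of the adaptive VAD gates, using the default knobs.
# Values are illustrative — real ones come from config_Voice.json::stt.
def vad_gates(ambient_rms: float,
              speech_entry_rms: float = 250.0,
              silence_exit_rms: float = 120.0,
              ambient_cap_rms: float = 200.0,
              ambient_mult: float = 2.5) -> tuple:
    ambient = min(ambient_rms, ambient_cap_rms)
    if ambient <= 0:
        return speech_entry_rms, silence_exit_rms
    entry = max(speech_entry_rms, ambient * ambient_mult * 1.8)
    exit_ = max(silence_exit_rms, ambient * ambient_mult)
    return entry, exit_

print(vad_gates(40))    # quiet room → (250.0, 120.0): config floors win
print(vad_gates(150))   # noisy room → (675.0, 375.0): baseline dominates
print(vad_gates(500))   # loud spike → capped at 200 → (900.0, 500.0)
```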
+ speech_start_byte: Optional[int] = None + preroll_bytes = int(self._sample_rate * 0.3) * 2 # 300 ms + wall_start = time.time() + + while total_time < max_dur and (time.time() - wall_start) < max_dur + 2: + raw = self._read_mic_gained(small_chunk_bytes) + if not raw: + time.sleep(0.005) + continue + collected.extend(raw) + analysis_buf.extend(raw) + total_time += (len(raw) // 2) / self._sample_rate + + while len(analysis_buf) >= analysis_bytes: + win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16) + del analysis_buf[:analysis_bytes] + rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2))) + peak_rms_seen = max(peak_rms_seen, rms) + + if rms >= adaptive_entry: + if not speech_seen: + speech_seen = True + # Record where speech started (byte offset + # in `collected`) so we can trim pre-roll later. + speech_start_byte = max(0, len(collected) - preroll_bytes) + silence_budget = 0.0 + elif speech_seen and rms < adaptive_exit: + silence_budget += analysis_ms / 1000.0 + # between exit and entry → hold state (hysteresis zone) + + # end only *after* we've heard real speech + if (speech_seen + and silence_budget >= silence_dur + and total_time >= min_dur): + log.info("silence after speech at %.2fs (peak_rms=%.0f)", + total_time, peak_rms_seen) + break + + if not speech_seen: + log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping", + total_time, peak_rms_seen, adaptive_entry) + return np.array([], dtype=np.int16) + + if total_time >= max_dur: + log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)", + total_time, peak_rms_seen) + + # Trim leading pre-speech silence. Keep 300 ms of pre-roll so + # the onset of the first phoneme is preserved for Whisper. + if speech_start_byte and speech_start_byte > 0: + trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000 + log.info("trimmed %.0f ms of leading silence " + "(pre-speech buffer %d bytes)", + trimmed_ms, speech_start_byte) + collected = collected[speech_start_byte:] + + return (np.frombuffer(bytes(collected), dtype=np.int16) + if collected else np.array([], dtype=np.int16)) + + # ─── transcription ──────────────────────────────────── + + def _transcribe(self, audio_i16: np.ndarray) -> str: + """int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" + fw = self._get_fw() + if fw is None: + return "" + + # mic_gain was already applied in _read_mic_gained() during + # _record_command, so audio_i16 here is already boosted. + + # int16 → float32 [-1, 1] + DSP pre-processing: + # 1. DC offset removal (subtract mean) — removes any mic bias + # 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise, + # and speaker-vibration resonance. Whisper ignores the + # rumble band anyway, but it inflates RMS estimation and + # steals dynamic range from the speech band. + # 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost + # that sharpens consonants (/t/, /s/, /k/ plosives/fricatives) + # which Whisper's mel features care most about. + # 4. Peak-normalize to 0.7. + audio_f32 = audio_i16.astype(np.float32) / 32768.0 + # 1. DC removal + audio_f32 = audio_f32 - np.mean(audio_f32) + # 2. High-pass at 80 Hz (1-pole IIR, stable + cheap) + audio_f32 = self._highpass_80hz(audio_f32) + # 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1] + audio_f32 = np.append( + audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1] + ) + # 4. 
Peak-normalize
+        peak = float(np.abs(audio_f32).max())
+        if peak > 1e-4 and peak < 0.7:
+            boost = 0.7 / peak
+            audio_f32 = audio_f32 * boost
+            log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak)
+
+        # Initial prompt biases the model toward our command vocabulary.
+        # Whisper uses it as decoder context — words in the prompt become
+        # more likely, which converts ambiguous low-SNR audio like "muv rahh"
+        # from a plausible English phrase ("and provide") into the intended
+        # command ("move right"). Keep it short — long prompts can be echoed
+        # back verbatim in the transcription.
+        init_prompt = self._stt.get(
+            "whisper_initial_prompt",
+            "turn left, turn right, move forward, walk back, stop, come here, "
+            "sit down, stand up, raise arm, wave, look around, what do you see, "
+            "remember this, go home, patrol."
+        )
+
+        beam_size = int(self._stt.get("whisper_beam_size", 5))
+        no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6))
+        log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0))
+        compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4))
+
+        # Temperature fallback: greedy first (T=0), then 0.2, then 0.4.
+        # faster-whisper retries automatically when a pass is rejected by
+        # its confidence gates (log_prob < threshold etc.). On noisy
+        # audio this commonly rescues a bad greedy decode.
+        temperatures = self._stt.get(
+            "whisper_temperature_fallback", [0.0, 0.2, 0.4]
+        )
+        try:
+            segments, info = fw.transcribe(
+                audio_f32,
+                language="en",
+                beam_size=beam_size,          # 5 = much better than greedy on noisy audio
+                temperature=temperatures,     # greedy → 0.2 → 0.4 fallback
+                initial_prompt=init_prompt,   # command-vocabulary bias (defaults to the list above)
+                condition_on_previous_text=False,
+                vad_filter=False,             # we already trimmed silence
+                without_timestamps=True,
+                # Whisper's built-in gates — drop transcripts that look
+                # like hallucinations (very low prob, highly compressed).
+                no_speech_threshold=no_speech_threshold,
+                log_prob_threshold=log_prob_threshold,
+                compression_ratio_threshold=compression_ratio_t,
+            )
+            # Collect segments and their mean log-prob for a confidence signal.
+            seg_list = list(segments)
+            text = " ".join(s.text for s in seg_list).strip()
+            nsp = float(getattr(info, "no_speech_prob", 0.0))
+            if seg_list:
+                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
+                log.info("whisper: lp=%.2f nsp=%.2f text=%r",
+                         mean_lp, nsp, text[:80])
+            else:
+                # CRITICAL: log even when Whisper returned zero segments
+                # so we can see WHY it dropped everything. Usually nsp is
+                # above the threshold or the log-prob gate killed it.
+                log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop",
+                         nsp, no_speech_threshold, log_prob_threshold)
+        except Exception as e:
+            log.error("faster-whisper transcribe failed: %s", e)
+            return ""
+
+        if not text:
+            return ""
+
+        # Reject Whisper garbage patterns (stt.garbage_patterns) and
+        # transcriptions shorter than stt.min_transcription_length.
+        # Preserve:
+        #   - bare wake words (a valid "just Sanad" signal → ack)
+        #   - exact matches in stt.command_vocab (legitimate short
+        #     commands like "go", "hi" must survive the length filter)
+        low = text.lower().strip().rstrip(".!?,")
+        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
+        if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
+            if low not in WAKE_WORDS and low not in vocab_exact:
+                log.info("Rejecting likely noise transcription: %r", text)
+                return ""
+
+        # NOTE: fuzzy-match to the canonical command phrase used to happen
+        # here, but that ran BEFORE gated mode could see the wake word.
+        # Moved to _normalize_command(), called at dispatch time AFTER the
+        # wake-word gate + wake-word strip, so the gate always sees the
+        # raw Whisper text.
+        return text
+
+    @staticmethod
+    def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray:
+        """
+        1-pole IIR high-pass at ~80 Hz. Attenuates HVAC/fan rumble
+        without touching the speech band. Cheap: one multiply and two
+        adds per sample.
+        """
+        if x.size < 2:
+            return x
+        # Alpha from fc = 80 Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc)
+        import math
+        rc = 1.0 / (2 * math.pi * 80.0)
+        dt = 1.0 / sr
+        alpha = rc / (rc + dt)
+        y = np.empty_like(x)
+        y[0] = x[0]
+        # Plain per-sample Python loop — an IIR filter is sequential, so
+        # numpy can't vectorize it directly. At command lengths (a few
+        # seconds of 16 kHz audio) this costs a few tens of milliseconds,
+        # negligible next to the Whisper decode that follows.
+        prev_y, prev_x = x[0], x[0]
+        for i in range(1, x.size):
+            cur = alpha * (prev_y + x[i] - prev_x)
+            y[i] = cur
+            prev_y, prev_x = cur, x[i]
+        return y
+
+    def _transcribe_raw(self, audio_i16: np.ndarray) -> str:
+        """
+        Like _transcribe but WITHOUT the garbage-pattern / length filters
+        and without the `initial_prompt` bias. Used for wake verify, where:
+          - We only care about the first phoneme (s/sh/z) — a 2-char "so"
+            is a valid /sa-/ signature and MUST NOT be dropped by
+            min_transcription_length.
+          - A biased initial_prompt makes Whisper echo itself on unclear
+            audio ("This is a robot assistant" → not s-starting → reject).
+        The downside (no "Sanad" nudge) is fine here because the acoustic
+        detector has already gated out non-speech.
+        """
+        fw = self._get_fw()
+        if fw is None:
+            return ""
+        if self._mic_gain != 1.0:
+            audio_i16 = np.clip(
+                audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767
+            ).astype(np.int16)
+        audio_f32 = audio_i16.astype(np.float32) / 32768.0
+        peak = float(np.abs(audio_f32).max())
+        if peak > 1e-4 and peak < 0.7:
+            audio_f32 = audio_f32 * (0.7 / peak)
+        try:
+            segments, info = fw.transcribe(
+                audio_f32,
+                language="en",
+                beam_size=int(self._stt.get("whisper_beam_size", 5)),
+                temperature=0.0,
+                initial_prompt="",            # NO bias → NO prompt echo
+                condition_on_previous_text=False,
+                vad_filter=False,
+                without_timestamps=True,
+                # Looser gates — we're about to do a phonetic match,
+                # not trust the transcription verbatim.
+                no_speech_threshold=0.85,
+                log_prob_threshold=-1.8,
+                compression_ratio_threshold=3.0,
+            )
+            seg_list = list(segments)
+            text = " ".join(s.text for s in seg_list).strip()
+            if seg_list:
+                mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list)
+                log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r",
+                         mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80])
+            return text
+        except Exception as e:
+            log.error("whisper-raw transcribe failed: %s", e)
+            return ""
+
+    # ─── command transcription ────────────────────────────
+
+    def _transcribe_command(self, audio_i16: np.ndarray) -> str:
+        """
+        Decode the recorded command audio with faster-whisper.
Thin + wrapper over self._transcribe(); exists so _handle_wake and the + always-on loop share one entry point. + """ + if audio_i16.size == 0: + return "" + return self._transcribe(audio_i16) + + def _save_turn_wav( + self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd", + ) -> Optional[str]: + """ + Save a single-turn command recording for debugging. + + Filename: {tag}_{epoch}_{sanitised_transcription}.wav + Examples: + cmd_1728562000_turn_right.wav ← successful command + cmd_1728562030_hi.wav ← Whisper misheard as 'Hi' + unk_1728562045_.wav ← Whisper returned empty + cmd_1728562060_thanks_for_watch.wav ← garbage-filtered + + Rotation: keeps the most recent 50 across all tags so the disk + doesn't fill up during a long session. Tunable via + stt.recording_keep_count. + """ + try: + import re as _re + import wave + rec_dir = os.path.join( + PROJECT_ROOT, + self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), + ) + os.makedirs(rec_dir, exist_ok=True) + + # Rotate — keep only the most recent N across all command WAVs. + keep = int(self._stt.get("recording_keep_count", 50)) + existing = sorted( + f for f in os.listdir(rec_dir) + if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav") + ) + for old in existing[:max(0, len(existing) - keep + 1)]: + try: os.remove(os.path.join(rec_dir, old)) + except Exception: pass + + # Sanitise transcription for filename: lowercase, alnum + _, <=40 chars + slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40] + path = os.path.join( + rec_dir, f"{tag}_{int(time.time())}_{slug}.wav" + ) + with wave.open(path, "wb") as w: + w.setnchannels(1) + w.setsampwidth(2) + w.setframerate(self._sample_rate) + w.writeframes(audio_i16.astype(np.int16).tobytes()) + return path + except Exception as e: + log.warning("failed to save turn wav: %s", e) + return None + + def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]: + """Backward-compat wrapper — save with the `unk` tag.""" + return self._save_turn_wav(audio_i16, transcription="", tag="unk") + + # ─── command normalization (post-gate) ──────────────── + + def _normalize_command(self, text: str) -> str: + """ + Apply fuzzy-match to the closest canonical command phrase. + Call AFTER the gated wake check so the wake word has already + been stripped by the caller if appropriate. Turns near-misses + like "Turn right up" → "turn right" so command_parser.py's + regex fast-path can hit them without an LLM round-trip. + """ + canonical = _closest_command(text, cutoff=self._vocab_cutoff) + if canonical != text: + log.info("fuzzy-match: %r → %r", text, canonical) + return canonical # ─── main loop ──────────────────────────────────────── def _voice_loop(self): + """ + Dispatch to the right loop based on stt.mode: + "wake_and_command" — require "Sanad" wake word (acoustic), then + record and transcribe a command. + "always_on" — Transcribe every utterance, log all, and + dispatch all to the brain. No wake. + "always_on_gated" — Transcribe every utterance and log all, + but ONLY dispatch utterances that contain + "Sanad" (fuzzy). Wake word is stripped + before the command is sent to the brain. 
+ """ + mode = self._stt.get("mode", "wake_and_command").lower() self._mic_capture.start() - log.info("Voice loop started — listening for wake (energy-based, no ML)") + if mode in ("always_on", "always_on_gated"): + self._voice_loop_always_on(gated=(mode == "always_on_gated")) + else: + self._voice_loop_wake() + + def _voice_loop_wake(self): + """Classic wake-and-command: listen for 'Sanad', then record command.""" + log.info("Voice loop started — listening for wake (energy-based)") + + was_speaking = False while self._running: try: - # Don't listen while the robot is speaking (prevents - # self-trigger from TTS output leaking into the mic). if self._audio.is_speaking: + was_speaking = True time.sleep(0.1) self._detector.reset() continue - chunk = self._mic_capture.read_chunk(1024) # ~32 ms at 16 kHz + if was_speaking: + time.sleep(0.25) + self._mic_capture.flush() + self._detector.reset() + was_speaking = False + + if time.time() < self._cooldown_until: + _ = self._read_mic_raw(1024) + self._detector.reset() + time.sleep(0.05) + continue + + chunk = self._read_mic_raw(1024) if not chunk: continue if self._detector.process(chunk): - self._on_wake_fired() + self._handle_wake() except Exception as e: log.error("Voice loop error: %s", e, exc_info=True) time.sleep(1) - def _on_wake_fired(self): - log.info("Wake detected (acoustic)") - print("\n [Sanad] wake heard — type your command at the prompt.") - # TTS ack - msg = self._messages.get("wake_heard", "Yes") - try: - self._audio.speak(msg) - except Exception as e: - log.warning("TTS ack failed: %s", e) + def _voice_loop_always_on(self, gated: bool = False): + """ + Always-on mode — Sanad-style continuous listening. - # Brain callbacks for compatibility with the old interface. - if self._on_wake: + If `gated` is True, utterances that don't contain the wake word + "Sanad" (or a fuzzy variant) are logged but NOT dispatched to the + brain — the robot hears everything, speaks only when addressed. + + Architecture (no wake word, no ack TTS): + 1. Continuously read the gained mic stream in 32 ms chunks. + 2. Run a hysteretic VAD on the stream — speech_entry_rms + starts an utterance, silence_exit_rms + silence_duration + ends one. + 3. On each utterance end → Whisper transcribe → fuzzy-match + → dispatch to brain. + 4. Every ~5 s of idle: log a `ambient: rms=... peak=...` line + so you can SEE what the mic is doing at all times, even + when nobody's talking. Matches Sanad's "always listening" + visibility. + 5. Speech is not gated on amplitude — everything above the + entry threshold is captured, quiet or loud. Loud speech + clips naturally against int16; Whisper handles it. + + Thresholds come from the same stt.* config as wake mode but are + typically tuned lower here (you want eager capture since there's + no wake-word gate to prevent false positives). 
+ """ + log.info( + "Voice loop started — ALWAYS-ON mode%s", + " [gated: only 'Sanad' utterances dispatched]" if gated + else " (no wake word — every utterance dispatched)" + ) + + speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0)) + silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0)) + silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8)) + min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3)) + max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0)) + idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0)) + ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4)) + ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100)) + + buffer = bytearray() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + idle_peak_rms = 0.0 + idle_sum_rms = 0.0 + idle_chunks = 0 + last_idle_log = time.time() + was_speaking_tts = False + + # Rolling ambient (idle-only) RMS buffer. Used to adapt silence_exit + # so a noisy room doesn't trap the VAD at max_utter_s: if the + # observed idle floor sits at rms=200, silence_exit needs to be + # above 200 or silence never accumulates. We take + # effective_exit = max(config_silence_exit, ambient_floor * mult). + ambient_buf: list = [] + ambient_floor = 0.0 + + # Seed ambient_floor by sampling ~1s of mic BEFORE entering the + # loop. Without this, the very first utterance runs with + # ambient_floor=0 → eff_exit=config_floor, which under-cuts + # noisy rooms and creates self-sustaining echo loops. + seed_chunks = [] + seed_deadline = time.time() + 1.0 + while time.time() < seed_deadline: + r = self._read_mic_gained(1024) + if r: + a = np.frombuffer(r, dtype=np.int16) + if a.size: + seed_chunks.append( + float(np.sqrt(np.mean(a.astype(np.float64) ** 2))) + ) + else: + time.sleep(0.005) + if seed_chunks: + # Use the median so one loud transient doesn't poison the seed. + seed_chunks.sort() + ambient_floor = seed_chunks[len(seed_chunks) // 2] + ambient_buf = list(seed_chunks[-ambient_win:]) + log.info("ambient seeded: floor=%.0f from %d chunks", + ambient_floor, len(seed_chunks)) + + while self._running: try: - self._on_wake() + # Drop mic input while the robot itself is speaking so we + # don't feed our own TTS back through Whisper. + if self._audio.is_speaking: + was_speaking_tts = True + buffer.clear() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + time.sleep(0.1) + continue + + if was_speaking_tts: + time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3))) + self._mic_capture.flush() + was_speaking_tts = False + + raw = self._read_mic_gained(1024) + if not raw: + time.sleep(0.005) + continue + + arr = np.frombuffer(raw, dtype=np.int16) + rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2))) + chunk_s = (len(raw) // 2) / self._sample_rate + + if in_speech: + buffer.extend(raw) + speech_duration += chunk_s + peak_rms = max(peak_rms, rms) + + # Adaptive silence exit: sits max(config_floor, + # ambient_floor × mult). Prevents the "room is noisier + # than silence_exit" failure mode where silence never + # accumulates and every utterance hits max_utter_s. 
+ eff_exit = max(silence_exit, ambient_floor * ambient_mult) + if rms < eff_exit: + silence_budget += chunk_s + else: + silence_budget = 0.0 + + utter_over = (silence_budget >= silence_dur and + speech_duration >= min_utter_s) + force_stop = speech_duration >= max_utter_s + + if utter_over or force_stop: + reason = "max-duration" if force_stop else "silence" + audio = np.frombuffer(bytes(buffer), dtype=np.int16) + log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d", + reason, speech_duration, peak_rms, audio.size) + + # RESET STATE IMMEDIATELY — before any Whisper / + # speak() / dispatch. Previously a `continue` from + # the wake-only ack branch skipped the reset, and + # the 12-second buffer lived forever, re-transcribed + # every iteration into the same "Sanad" output, + # spawning a self-sustaining "Yes" loop. + buffer.clear() + in_speech = False + silence_budget = 0.0 + speech_duration = 0.0 + peak_rms = 0.0 + + text = self._transcribe_command(audio) if audio.size else "" + if text: + log.info("HEARD: %r", text) + # Gated mode: only dispatch if the wake word was + # spoken. Everything is still logged above so the + # operator has full visibility into what the mic + # is picking up. + if gated and not _has_wake_word(text): + log.info(" (no wake word — not dispatched)") + else: + if gated: + command = _strip_wake_word(text) + if command != text: + log.info(" wake-stripped: %r → %r", + text, command) + # Bare wake word ("Sanad.", "Sanad") → + # speak a "Yes" ack, do NOT call the + # brain (it would hallucinate a random + # response from a 1-word prompt). + if not command: + log.info(" wake-only utterance — speaking ack") + try: + self._audio.speak( + self._messages.get("wake_heard", "Yes") + ) + except Exception as e: + log.warning("wake-ack TTS failed: %s", e) + continue + else: + command = text + + # Normalize near-misses ("Turn right up" → + # "turn right") so command_parser's regex + # fast-path can hit without an LLM round-trip. + command = self._normalize_command(command) + + print(f' [Sanad] heard: "{command}"') + if self._on_command: + try: + self._on_command(command, "en") + except Exception as e: + log.error("on_command: %s", e, exc_info=True) + else: + log.info("utterance rejected (empty/garbage after Whisper)") + else: + idle_peak_rms = max(idle_peak_rms, rms) + idle_sum_rms += rms + idle_chunks += 1 + + # Maintain the rolling ambient floor so silence_exit can + # adapt. Use windows that are *clearly* not speech + # (rms < speech_entry / 2) — otherwise a borderline + # window just before transition pollutes the floor. 
+ if rms < speech_entry * 0.5: + ambient_buf.append(rms) + if len(ambient_buf) > ambient_win: + ambient_buf.pop(0) + if ambient_buf: + ambient_floor = sum(ambient_buf) / len(ambient_buf) + + if rms >= speech_entry: + # utterance starts — keep this chunk as pre-roll + log.info("utterance start (rms=%.0f >= entry=%.0f)", + rms, speech_entry) + buffer.extend(raw) + in_speech = True + speech_duration = chunk_s + peak_rms = rms + silence_budget = 0.0 + + # periodic ambient log while idle — "I am listening" + now = time.time() + if (now - last_idle_log) >= idle_log_s and idle_chunks > 0: + eff_exit = max(silence_exit, ambient_floor * ambient_mult) + log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d " + "floor=%.0f entry=%.0f eff_exit=%.0f", + idle_sum_rms / idle_chunks, idle_peak_rms, + idle_chunks, ambient_floor, speech_entry, eff_exit) + idle_peak_rms = 0.0 + idle_sum_rms = 0.0 + idle_chunks = 0 + last_idle_log = now except Exception as e: - log.error("on_wake callback error: %s", e) - elif self._on_command: - # Old API expected (text, lang). We have no transcription, so - # pass empty text — brain is expected to prompt for typed input. + log.error("Always-on voice loop error: %s", e, exc_info=True) + time.sleep(1) + + def _handle_wake(self): + t_wake = time.time() + log.info("Wake detected (acoustic)") + + # Verify the burst that triggered wake actually sounds like a + # wake word. The acoustic detector fires on ANY 0.2-1.5s burst + # (coughs, claps, door slams). We run a lightweight Whisper + # decode on the burst and accept if EITHER: + # (a) a wake-word variant is in the transcription, OR + # (b) the transcription starts with 's'/'sh'/'z' — Whisper's + # consistent signature for mishearing non-English "Sanad" + # as an English /sa-/ word ("Stop", "Set", "Sand", "Send"). + # Reject if Whisper returns empty (pure noise / cough) or + # confidently not-s speech ("hello", "okay"). + if self._stt.get("wake_verify_enabled", True): + burst = self._detector.get_last_burst() + if burst is not None and burst.size >= int(0.15 * self._sample_rate): + t_verify = time.time() + # Lenient transcribe — no garbage filter, no min-length, + # no bias prompt. See _transcribe_raw docstring. 
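The accept/reject rule just described is compact enough to state as a standalone predicate. A sketch — the wake-word set is illustrative (the real one is `stt.wake_words`) and the helper name is hypothetical:

```python
# Illustrative standalone version of the wake-verify predicate described
# above. WAKE mirrors stt.wake_words; variants listed here are examples.
import re

WAKE = {"sanad", "senad", "sanaad"}

def verify_wake(heard: str) -> bool:
    low = (heard or "").lower().strip().lstrip('"\'.,!?')
    if not low:
        return False                         # empty → cough / door slam
    if any(re.search(r'\b' + re.escape(w) + r'\b', low) for w in WAKE):
        return True                          # explicit wake-word variant
    return low.startswith(("s", "sh", "z"))  # /sa-/ phonetic signature

print(verify_wake("Sanad."))  # True  — wake word
print(verify_wake("Stop"))    # True  — s-starting lookalike
print(verify_wake("hello"))   # False — confidently not-s speech
print(verify_wake(""))        # False — pure noise
```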
+ heard = self._transcribe_raw(burst) + verify_ms = (time.time() - t_verify) * 1000 + low = (heard or "").lower().strip().lstrip('"\'.,!?') + if not low: + log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms) + return + starts_with_s = low.startswith(("s", "sh", "z")) + if _has_wake_word(heard): + log.info(" wake verified (wake-word: %r, %.0fms)", + heard, verify_ms) + elif starts_with_s: + log.info(" wake verified (s-phonetic: %r, %.0fms)", + heard, verify_ms) + else: + log.info(" wake REJECTED — %r (%.0fms, not s-starting)", + heard, verify_ms) + return + + print("\n [Sanad] wake heard — listening…") + + ack_mode = self._stt.get("wake_ack", "tts").lower() + if ack_mode == "none": + log.info(" wake-ack: silent (no TTS)") + else: try: - self._on_command("", "en") + self._audio.speak(self._messages.get("wake_heard", "Yes")) except Exception as e: - log.error("on_command callback error: %s", e) + log.warning("TTS ack failed: %s", e) + + # Wait for ack TTS + speaker reverberation to decay + while self._audio.is_speaking: + time.sleep(0.05) + settle = float(self._stt.get("post_tts_settle_sec", 0.3)) + time.sleep(settle) + self._mic_capture.flush() + log.info(" wake→record-ready: %.2fs", time.time() - t_wake) + + log.info("Recording command...") + audio = self._record_command() + # _record_command returns empty if it never saw speech above the + # adaptive entry threshold — no point running STT on noise. + # Two cases: + # audio.size == 0 → no speech at all (likely false wake + # from cough/slam). SILENTLY reset — + # don't blurt "I didn't catch that" on + # what was never a real interaction. + # 0 < size < 8000 → brief speech burst (< 0.5s). Probably + # a real-but-unintelligible attempt; + # speak "I didn't catch that" so the + # user knows to retry. + if audio.size == 0: + log.info("Command dropped (no speech — likely false wake); silent reset") + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + if audio.size < 8000: # < 0.5 s but > 0 — real short attempt + log.info("Command too short (%.2fs); asking user to repeat", + audio.size / self._sample_rate) + try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) + except Exception: pass + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + + peak = int(np.abs(audio).max()) + rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) + log.info("command audio: samples=%d peak=%d rms=%.1f", + audio.size, peak, rms) + + text = self._transcribe_command(audio) + if not text: + log.info("Empty or rejected transcription") + # Save WAV of the failed transcription for post-mortem. + if self._stt.get("recording_enabled", True): + self._save_turn_wav(audio, transcription="", tag="unk") + try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) + except Exception: pass + self._cooldown_until = time.time() + float( + self._stt.get("command_cooldown_sec", 1.5)) + return + + # Normalize near-miss transcriptions like "Turn right up" → "turn + # right" so the brain's regex fast-path catches them. + text = self._normalize_command(text) + log.info("Transcribed: %s", text[:120]) + + # Save every successful command recording so you can listen back + # later and see what the mic actually heard vs what Whisper + # transcribed. Disable with stt.recording_enabled=false. 
+ if self._stt.get("recording_enabled", True): + wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd") + if wav_path: + log.info("saved: %s", os.path.basename(wav_path)) + + if self._on_command: + try: + self._on_command(text, "en") + except Exception as e: + log.error("on_command error: %s", e, exc_info=True) + elif self._on_wake: + try: self._on_wake() + except Exception: pass + + cd = float(self._stt.get("command_cooldown_sec", 1.5)) + self._cooldown_until = time.time() + cd + log.info("wake→dispatch total: %.2fs | cooldown %.1fs", + time.time() - t_wake, cd) # ─── start / stop ───────────────────────────────────── @@ -166,18 +1143,14 @@ class VoiceModule: log.warning("VoiceModule already running") return self._running = True - self._thread = threading.Thread( - target=self._voice_loop, daemon=True, name="voice", - ) + self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice") self._thread.start() log.info("Voice module started") def stop(self): self._running = False - try: - self._mic_capture.stop() - except Exception: - pass + try: self._mic_capture.stop() + except Exception: pass if self._thread: self._thread.join(timeout=5) self._thread = None @@ -188,22 +1161,15 @@ class VoiceModule: return self._running -# ─── standalone test ───────────────────────────────────── if __name__ == "__main__": from API.audio_api import AudioAPI - - def on_wake(): - print(" (brain callback fired)") - + def on_cmd(text, lang): + print(f"\n COMMAND [{lang}]: {text}\n") audio = AudioAPI() - voice = VoiceModule(audio, on_wake=on_wake) - print("Starting voice module... say any short word to test the wake.") - print("Press Ctrl-C to stop.\n") + voice = VoiceModule(audio, on_command=on_cmd) + print('Starting. Say "Sanad", then speak your command.\n') voice.start() try: - while voice.is_running: - time.sleep(0.5) + while voice.is_running: time.sleep(0.5) except KeyboardInterrupt: - print("\nStopping...") voice.stop() - print("Done.") diff --git a/Voice/wake_detector.py b/Voice/wake_detector.py index ef90796..640ad11 100644 --- a/Voice/wake_detector.py +++ b/Voice/wake_detector.py @@ -49,10 +49,14 @@ import numpy as np @dataclass class WakeConfig: sample_rate: int = 16_000 - # RMS (int16 units) above which we consider a chunk to be speech. - # G1 on-board mic at normal speaking distance has rms ≈ 500-1500 - # during speech and ≈ 40-100 in silence. 150 is a safe middle ground. - speech_threshold: float = 150.0 + # RMS (int16 units) FLOOR for "this chunk is speech". The effective + # threshold is max(speech_threshold, ambient_baseline * adaptive_mult) + # so this is only a minimum guarantee — the detector adapts upward + # in noisy rooms but never below this floor. + # G1 far-field mic at normal speaking distance has rms ~ 80-400 for + # quiet speech, 400-1500 for clear speech. 80 catches quiet speech; + # raise to 120-150 if fan/typing noise triggers false wakes. + speech_threshold: float = 80.0 # How long a burst of speech must last to count as a "word". min_word_duration_s: float = 0.20 max_word_duration_s: float = 1.50 @@ -63,6 +67,13 @@ class WakeConfig: cooldown_s: float = 1.50 # RMS window size — we analyze this many ms of audio per step. chunk_ms: int = 50 + # Adaptive: how many *recent silent* chunks to average for the noise + # floor, and the multiplier applied on top. effective_threshold = + # max(speech_threshold, baseline * adaptive_mult). 
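The adaptive rule described in that comment reduces to one line; a quick numeric sketch with the defaults above (baseline values illustrative):

```python
# Effective wake threshold — the rule from the WakeConfig comment above.
# speech_threshold and adaptive_mult come from config_Voice.json::stt.
def effective_threshold(baseline: float,
                        speech_threshold: float = 80.0,
                        adaptive_mult: float = 3.0) -> float:
    return max(speech_threshold, baseline * adaptive_mult)

print(effective_threshold(10))    # dead quiet:  80.0 — floor wins
print(effective_threshold(30))    # quiet room:  90.0 — slightly above floor
print(effective_threshold(120))   # noisy room: 360.0 — adapts upward
```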
+ adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks + adaptive_mult: float = 3.0 + # Periodic diagnostic log cadence (seconds). 0 disables. + diag_log_sec: float = 3.0 class WakeDetector: @@ -88,6 +99,24 @@ class WakeDetector: # chunks don't align with our internal analysis window). self._carry = np.zeros(0, dtype=np.int16) + # Audio of the most-recent wake-triggering burst. Saved when the + # detector fires so callers (marcus_voice) can run Whisper on it + # and verify the word was actually "Sanad" rather than a cough. + self._burst_samples: list = [] # accumulated during SPEAKING + self._last_burst_audio: Optional[np.ndarray] = None + + # Adaptive noise floor (rolling mean of RMS during SILENCE chunks). + self._baseline_buf = [] # last N silent-window RMS values + self._baseline = 0.0 # current estimate + self._peak_since_diag = 0.0 # max rms since last diag log + self._last_diag = time.time() + # Logger is optional — if the host app set up logging, use it. + try: + import logging + self._log = logging.getLogger("wake_detector") + except Exception: + self._log = None + # ── public API ──────────────────────────────────────────────── def process(self, pcm_bytes: bytes) -> bool: @@ -122,31 +151,73 @@ class WakeDetector: self._state = self.STATE_SILENCE self._silence_run = 0 self._carry = np.zeros(0, dtype=np.int16) + self._burst_samples = [] + + def get_last_burst(self) -> Optional[np.ndarray]: + """ + Return the int16 PCM samples of the most-recent wake-triggering + burst, or None if no wake has fired yet. Used by marcus_voice to + verify the triggering word was actually 'Sanad' before proceeding. + """ + return self._last_burst_audio # ── internal ────────────────────────────────────────────────── def _step(self, window: np.ndarray) -> bool: rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2))) - is_speech = rms > self.cfg.speech_threshold + # Effective threshold = max(config floor, adaptive baseline * mult) + eff = self.cfg.speech_threshold + if self._baseline > 0: + eff = max(eff, self._baseline * self.cfg.adaptive_mult) + is_speech = rms > eff + + # Track peak for diag. Log periodically so you can *see* what the + # detector is hearing — invaluable when "not hearing me" happens. + if rms > self._peak_since_diag: + self._peak_since_diag = rms now = time.time() + if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec: + if self._log is not None: + self._log.info( + "wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s", + self._peak_since_diag, self._baseline, eff, self._state, + ) + self._peak_since_diag = 0.0 + self._last_diag = now + if now < self._cooldown_until: return False # silent during cooldown if self._state == self.STATE_SILENCE: + # Learn the noise floor ONLY in silence — so speech bursts + # don't pull the baseline up and lock us out of wake. + if not is_speech: + self._baseline_buf.append(rms) + if len(self._baseline_buf) > self.cfg.adaptive_window_n: + self._baseline_buf.pop(0) + if self._baseline_buf: + self._baseline = sum(self._baseline_buf) / len(self._baseline_buf) if is_speech: self._state = self.STATE_SPEAKING self._speech_start = self._sample_cursor self._silence_run = 0 + # Begin capturing the burst audio for later Whisper verify. + self._burst_samples = [window.copy()] return False # STATE_SPEAKING + # Accumulate every window (speech OR silence inside the burst) + # so we capture the full word + trailing quiet. 
+ self._burst_samples.append(window.copy()) + if is_speech: self._silence_run = 0 # Abort if the burst is longer than a single word — user is # just talking, not addressing the robot. if self._sample_cursor - self._speech_start > self._max_speech: self._state = self.STATE_SILENCE + self._burst_samples = [] return False # Silent window inside SPEAKING — accumulate. @@ -156,6 +227,12 @@ class WakeDetector: self._state = self.STATE_SILENCE self._silence_run = 0 if self._min_speech <= speech_len <= self._max_speech: + # Snapshot burst audio for the caller's Whisper verify. + self._last_burst_audio = ( + np.concatenate(self._burst_samples) + if self._burst_samples else None + ) + self._burst_samples = [] self._cooldown_until = now + self.cfg.cooldown_s return True return False
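For completeness, a bench sketch (not part of the patch) showing how the adaptive threshold and burst capture behave on synthetic PCM, assuming the detector's defaults; amplitudes, durations, and the `feed` helper are illustrative:

```python
# Bench sketch: feed WakeDetector synthetic PCM and watch the adaptive
# baseline learn, then a 0.5 s burst + trailing silence trigger a wake.
import numpy as np
from Voice.wake_detector import WakeDetector, WakeConfig

SR = 16_000
det = WakeDetector(WakeConfig(sample_rate=SR, diag_log_sec=0))

def feed(samples: np.ndarray) -> bool:
    fired = False
    pcm = samples.astype(np.int16).tobytes()
    for off in range(0, len(pcm), 1024):      # ~32 ms chunks, like the mic
        fired |= det.process(pcm[off:off + 1024])
    return fired

rng = np.random.default_rng(0)
quiet = rng.integers(-50, 50, 3 * SR)         # ~3 s idle → baseline learns
word  = 3000 * np.sin(2 * np.pi * 300 * np.arange(int(0.5 * SR)) / SR)
tail  = rng.integers(-50, 50, SR)             # silence after the burst

assert not feed(quiet)                        # idle noise: no wake
fired = feed(word) or feed(tail)              # word + post-silence → wake
burst = det.get_last_burst()
print("wake fired:", fired,
      "| burst samples:", None if burst is None else burst.size)
```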