commit e0f6acd5c7 (parent 8491be7f1e)
Update 2026-04-21 16:10:00

API/audio_api.py — 278 changed lines
@@ -2,19 +2,24 @@
 """
 API/audio_api.py — Marcus Audio API Layer
 ==========================================
-Provides speak() and record_audio() for the Brain layer.
+Provides speak() and record() for the Brain layer.
 Brain imports ONLY from this API — never from unitree SDK directly.

-Speaker: _CallRequestWithParamAndBin (single call, full buffer)
-Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config)
-TTS EN: Unitree built-in TtsMaker
-TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker
+Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
+    no MP3/WAV plumbing, no internet). Optional raw-PCM playback path
+    via _play_pcm() is kept for future modules that synthesize their
+    own audio (e.g. offline Piper).
+Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
+    Legacy Hollyland/parec path retained as fallback when
+    config_Voice.json has mic.backend="pactl_parec".
+TTS: English only. Arabic is rejected (the G1 firmware silently maps
+    Arabic to Chinese, which confuses everyone — if Arabic TTS is ever
+    needed again, use a separate offline backend like Piper).

 Usage:
     from API.audio_api import AudioAPI
     audio = AudioAPI()
-    audio.speak("Hello", "en")
-    audio.speak("مرحبا", "ar")
+    audio.speak("Hello, I am Sanad")
     recording = audio.record(seconds=5)
     audio.play_pcm(recording)
 """
@@ -71,7 +76,24 @@ class AudioAPI:
         self._tts = self._config["tts"]
         self._mic = self._config["mic"]
         self._spk = self._config["speaker"]
-        self._target_rate = self._tts["target_sample_rate"]
+        self._target_rate = self._tts.get("target_sample_rate", 16000)
+
+        # Default mic backend: G1 built-in UDP multicast.
+        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
+        # to the legacy Hollyland/PulseAudio path.
+        self._mic_backend = self._mic.get("backend", "builtin_udp")
+        self._builtin_mic = None  # lazy-initialized on first record()
+
+        # Built-in TTS wrapper (uses the already-initialized AudioClient).
+        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
+        # loop that needs to skip mic input during playback.
+        self._tts_engine = None
+        if self._sdk_available:
+            from Voice.builtin_tts import BuiltinTTS
+            self._tts_engine = BuiltinTTS(
+                self._client,
+                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
+            )

         # Data dir
         data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
@@ -82,7 +104,10 @@ class AudioAPI:
         self._speaking = False
         self._speak_lock = threading.Lock()

-        log.info(self._config["messages"]["ready"])
+        log.info("%s (mic=%s, tts=%s)",
+                 self._config["messages"]["ready"],
+                 self._mic_backend,
+                 "builtin_ttsmaker" if self._tts_engine else "disabled")

     def _init_sdk(self):
         """Initialize Unitree AudioClient."""
@@ -105,55 +130,63 @@ class AudioAPI:

     # ─── SPEAK ────────────────────────────────────────────

-    def speak(self, text: str, lang: str = "auto"):
+    def speak(self, text: str, lang: str = "en"):
         """
-        Speak text in the given language.
-        Mutes mic during playback to prevent self-listening.
-        lang="en" → built-in TtsMaker
-        lang="ar" → Piper → resample → G1 speaker
-        lang="auto" → detect from text
-        """
-        if lang == "auto":
-            lang = self._detect_lang(text)
-
-        log.info("[%s] speak: %s", lang.upper(), text[:80])
+        Speak `text` in English through the G1 built-in TTS (TtsMaker).
+
+        Mutes (flushes) the mic during playback so the voice loop doesn't
+        hear the robot's own voice and transcribe itself. The `lang`
+        argument is accepted for API compatibility but only "en" plays —
+        non-ASCII text (Arabic) is rejected by BuiltinTTS.
+        """
+        if lang and lang not in ("en", "auto"):
+            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
+            return
+        if self._tts_engine is None:
+            log.error("No TTS engine initialized — audio SDK unavailable")
+            return
+
+        log.info("speak: %s", text[:80])

         with self._speak_lock:
             self._speaking = True
             self._mute_mic()

             try:
-                if lang == "en":
-                    self._speak_english(text)
-                elif lang == "ar":
-                    self._speak_arabic(text)
-                else:
-                    log.warning("Unknown lang '%s', falling back to English", lang)
-                    self._speak_english(text)
+                self._tts_engine.speak(text, block=True)
             except Exception as e:
                 log.error("%s: %s", self._config["messages"]["error_tts"], e)
             finally:
-                # Small delay so speaker fully stops before mic reopens
-                time.sleep(0.3)
+                # Small tail so the speaker fully finishes before the mic is
+                # re-opened for capture
+                time.sleep(0.2)
                 self._unmute_mic()
                 self._speaking = False

     def _mute_mic(self):
-        """Mute the wireless mic to prevent self-listening."""
+        """
+        Suppress mic input during TTS playback.
+        For the UDP built-in mic, flush the buffer so we don't capture any
+        echo that's already been queued. For the legacy PulseAudio path,
+        actually mute the source.
+        """
+        if self._mic_backend == "builtin_udp":
+            if self._builtin_mic is not None:
+                self._builtin_mic.flush()
+            return
         source = self._mic["source_index"]
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "1"],
-            capture_output=True,
-        )
+        subprocess.run(["pactl", "set-source-mute", source, "1"],
+                       capture_output=True)
         log.debug("Mic muted")

     def _unmute_mic(self):
-        """Unmute the wireless mic."""
+        """Re-enable mic after TTS playback (pactl path only)."""
+        if self._mic_backend == "builtin_udp":
+            if self._builtin_mic is not None:
+                self._builtin_mic.flush()
+            return
         source = self._mic["source_index"]
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "0"],
-            capture_output=True,
-        )
+        subprocess.run(["pactl", "set-source-mute", source, "0"],
+                       capture_output=True)
         log.debug("Mic unmuted")

     @property
@@ -161,88 +194,8 @@ class AudioAPI:
         """True while TTS is playing — voice module checks this."""
         return self._speaking

-    def _speak_english(self, text: str):
-        """English TTS via edge-tts."""
-        self._speak_edge_tts(text, "en")
-
-    def _speak_arabic(self, text: str):
-        """Arabic TTS via edge-tts."""
-        self._speak_edge_tts(text, "ar")
-
-    def speak_piper_en(self, text: str):
-        """Alternative: English via Piper instead of built-in."""
-        voice = self._tts["piper_voice_en"]
-        audio, rate = self._piper_synthesize(text, voice)
-        audio_16k = self._resample(audio, rate)
-        self._play_pcm(audio_16k)
-
-    # ─── PIPER TTS ────────────────────────────────────────
-
-    def _piper_synthesize(self, text: str, voice: str) -> tuple:
-        """Run Piper CLI, return (audio_int16, sample_rate)."""
-        cmd = ["piper", "--model", voice, "--output_raw"]
-        timeout = self._tts["piper_timeout_sec"]
-
-        proc = subprocess.run(
-            cmd,
-            input=text.encode("utf-8"),
-            capture_output=True,
-            timeout=timeout,
-        )
-
-        if proc.returncode != 0:
-            stderr = proc.stderr.decode()[:300]
-            raise RuntimeError(f"Piper failed: {stderr}")
-
-        audio = np.frombuffer(proc.stdout, dtype=np.int16)
-        piper_rate = self._tts["piper_sample_rate"]
-        log.info("Piper: %d samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate)
-        return audio, piper_rate
-
-    # ─── RESAMPLE ─────────────────────────────────────────
-
-    def _speak_edge_tts(self, text: str, lang: str):
-        """Generate speech via edge-tts and play on G1."""
-        import os as _os
-        voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
-        ts = int(time.time() * 1000)
-        mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
-        wav_path = f"/tmp/edge_{lang}_{ts}.wav"
-
-        safe_text = text.replace('"', '\\"')
-        code = f'import edge_tts, asyncio; asyncio.run(edge_tts.Communicate(\"{safe_text}\", voice=\"{voice}\").save(\"{mp3_path}\"))'
-        result = subprocess.run(["python3", "-c", code], capture_output=True, text=True, timeout=30)
-
-        if result.returncode != 0:
-            log.error("edge-tts failed: %s", result.stderr[:200])
-            if lang == "en" and self._sdk_available:
-                self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
-                time.sleep(max(2.0, len(text) * 0.06))
-            return
-
-        try:
-            from pydub import AudioSegment
-            a = AudioSegment.from_mp3(mp3_path)
-            a = a.set_frame_rate(16000).set_channels(1).set_sample_width(2)
-            a.export(wav_path, format="wav")
-
-            import wave
-            with wave.open(wav_path, "rb") as wf:
-                audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
-
-            _os.unlink(mp3_path)
-            _os.unlink(wav_path)
-            self._play_pcm(audio)
-        except Exception as e:
-            log.error("edge-tts conversion error: %s", e)
-            try: _os.unlink(mp3_path)
-            except: pass
-            try: _os.unlink(wav_path)
-            except: pass
-
     def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
-        """Resample to target rate (16kHz)."""
+        """Linear resample int16 PCM to self._target_rate (16 kHz)."""
         if src_rate == self._target_rate:
             return audio
         tl = int(len(audio) * self._target_rate / src_rate)
@@ -252,7 +205,7 @@ class AudioAPI:
             audio.astype(np.float64),
         ).astype(np.int16)

-    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────
+    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────

     def _play_pcm(self, audio_16k: np.ndarray) -> float:
         """Play 16kHz mono int16 on G1 speaker. Returns duration."""
@@ -308,24 +261,50 @@ class AudioAPI:
     # ─── MIC RECORDING ───────────────────────────────────

     def record(self, seconds: float = 5.0) -> np.ndarray:
-        """Record from Hollyland wireless mic via parec. Returns int16 array."""
+        """
+        Capture `seconds` of int16 mono 16 kHz PCM.
+
+        Default backend is the G1 built-in mic (UDP multicast). Set
+        mic.backend="pactl_parec" in config_Voice.json to use the
+        legacy Hollyland/parec path instead.
+        """
+        if self._mic_backend == "builtin_udp":
+            return self._record_builtin(seconds)
+        return self._record_parec(seconds)
+
+    def _record_builtin(self, seconds: float) -> np.ndarray:
+        """Built-in mic path — join UDP multicast, read the requested duration."""
+        if self._builtin_mic is None:
+            from Voice.builtin_mic import BuiltinMic
+            mcfg = self._config.get("mic_udp", {})
+            self._builtin_mic = BuiltinMic(
+                group=mcfg.get("group", "239.168.123.161"),
+                port=mcfg.get("port", 5555),
+                buf_max=mcfg.get("buffer_max_bytes", 64000),
+            )
+            self._builtin_mic.start()
+            time.sleep(0.2)  # let the receiver thread fill in
+
+        log.info("Recording %.1fs from G1 built-in mic", seconds)
+        raw = self._builtin_mic.read_seconds(seconds)
+        audio = np.frombuffer(raw, dtype=np.int16)
+        log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
+        if audio.std() < 50:
+            log.warning(self._config["messages"]["error_mic"] +
+                        " — G1 mic silent (check audio service on robot)")
+        return audio
+
+    def _record_parec(self, seconds: float) -> np.ndarray:
+        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
         source = self._mic["source_index"]
         rate = str(self._mic["rate"])
         channels = str(self._mic["channels"])
         fmt = self._mic["format"]

         # Unmute mic
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "0"],
-            capture_output=True,
-        )
-        subprocess.run(
-            ["pactl", "set-source-volume", source, "100%"],
-            capture_output=True,
-        )
-
-        log.info("Recording %.1fs from mic source %s", seconds, source)
+        subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
+        subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)

+        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
         proc = subprocess.Popen(
             ["parec", "-d", source,
              f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
@@ -337,10 +316,8 @@ class AudioAPI:

         audio = np.frombuffer(raw, dtype=np.int16)
         log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
-
         if audio.std() < 50:
             log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
-
         return audio

     def save_recording(self, audio: np.ndarray, name: str) -> str:
@@ -355,16 +332,6 @@ class AudioAPI:
         log.info("Saved: %s", path)
         return path

-    # ─── LANGUAGE DETECTION ───────────────────────────────
-
-    @staticmethod
-    def _detect_lang(text: str) -> str:
-        """Detect language from text — Arabic Unicode range check."""
-        for c in text:
-            if '\u0600' <= c <= '\u06FF':
-                return "ar"
-        return "en"
-
     # ─── STATUS ───────────────────────────────────────────

     @property
@@ -378,27 +345,16 @@ if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Marcus Audio API Test")
-   parser.add_argument("--test", action="store_true", help="Run speak tests")
-   parser.add_argument("--speak", type=str, help="Speak this text")
-   parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
+   parser.add_argument("--test", action="store_true", help="Run TTS + record test")
+   parser.add_argument("--speak", type=str, help="Speak this English text")
    parser.add_argument("--record", type=float, default=0, help="Record N seconds")
    args = parser.parse_args()

    api = AudioAPI()

    if args.test:
-       print("\n--- English built-in ---")
-       api.speak("Hello, I am Marcus.", "en")
-       time.sleep(1)
-
-       print("\n--- Arabic Piper ---")
-       api.speak("مرحبا، أنا ماركوس", "ar")
-       time.sleep(1)
-
-       print("\n--- Auto-detect ---")
-       api.speak("How are you?")
-       time.sleep(1)
-       api.speak("كيف حالك؟")
+       print("\n--- English (TtsMaker) ---")
+       api.speak("Hello, I am Sanad.")
+       time.sleep(1)

        print("\n--- Record 3s + playback ---")
@@ -408,7 +364,7 @@ if __name__ == "__main__":
        print("\nDone.")

    elif args.speak:
-       api.speak(args.speak, args.lang)
+       api.speak(args.speak)

    elif args.record > 0:
        rec = api.record(args.record)

API/yolo_api.py
@@ -49,9 +49,28 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
        print(f"marcus_yolo.py not found ({e})")
        return False

-   # GPU is required — let RuntimeError from _resolve_device propagate so
-   # Marcus hard-fails at startup instead of silently running without vision.
-   ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
+   # GPU is required. _resolve_device() raises RuntimeError when CUDA is
+   # missing — surface that with an actionable banner before re-raising so
+   # Marcus hard-fails with a clear error instead of a raw stack trace.
+   try:
+       ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
+   except RuntimeError as e:
+       print()
+       print("╔" + "═" * 68 + "╗")
+       print("║ MARCUS STARTUP ABORTED — GPU REQUIRED".ljust(69) + "║")
+       print("╠" + "═" * 68 + "╣")
+       print(f"║ {str(e)[:66]:<66} ║")
+       print("║" + " " * 68 + "║")
+       print("║ On the Jetson, verify:".ljust(69) + "║")
+       print("║   tegrastats   # GPU exists & is not throttled".ljust(69) + "║")
+       print("║   python3 -c 'import torch; print(torch.cuda.is_available())'".ljust(69) + "║")
+       print("║   nvcc --version   # CUDA toolkit reachable".ljust(69) + "║")
+       print("║ Expected: torch 2.1.0 nv23.06, CUDA 11.4, GPU=Orin.".ljust(69) + "║")
+       print("║ See Doc/environment.md section 9 for the reinstall recipe.".ljust(69) + "║")
+       print("╚" + "═" * 68 + "╝")
+       print()
+       raise

    if ok:
        YOLO_AVAILABLE = True
        yolo_sees = _ys

API/zmq_api.py
@@ -1,7 +1,16 @@
 """
 zmq_api.py — ZMQ velocity + command interface to Holosoma
+
+Previously the PUB socket was bound at module import time. That made the
+module unsafe to re-import from any multiprocessing child (e.g. the LiDAR
+SLAM_worker spawn), because the child would try to rebind the same port
+and crash with `Address already in use`.
+
+The bind now lives in init_zmq() — call it once from the brain entrypoint.
+Child processes can import this module without any network side effects.
 """
 import json
+import os
 import time
 import zmq
 from Core.config_loader import load_config
@@ -15,35 +24,62 @@ STOP_ITERATIONS = _cfg["stop_iterations"]
 STOP_DELAY = _cfg["stop_delay"]
 STEP_PAUSE = _cfg["step_pause"]

-ctx = zmq.Context()
-sock = ctx.socket(zmq.PUB)
-sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
-time.sleep(0.5)
-log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT}", "info", "zmq")
+# Shared state. These stay None until init_zmq() is called.
+ctx: zmq.Context = None
+sock: zmq.Socket = None
+_INIT_SETTLE = 0.5  # seconds to let PUB tell subscribers it's alive
+
+
+def init_zmq() -> zmq.Socket:
+    """
+    Bind the PUB socket. Idempotent — safe to call more than once.
+    Call from the main (parent) process only. Do NOT call from multiprocessing
+    children — they inherit nothing useful from the bound socket anyway.
+    """
+    global ctx, sock
+    if sock is not None:
+        return sock
+    ctx = zmq.Context()
+    sock = ctx.socket(zmq.PUB)
+    sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
+    time.sleep(_INIT_SETTLE)
+    log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT} (pid={os.getpid()})",
+        "info", "zmq")
+    return sock
+
+
+def _ensure_sock() -> zmq.Socket:
+    if sock is None:
+        raise RuntimeError(
+            "zmq_api not initialized — call init_zmq() from the brain "
+            "entrypoint before using send_vel/send_cmd/gradual_stop"
+        )
+    return sock


 def get_socket():
     """Return the shared ZMQ PUB socket (for odometry to reuse)."""
-    return sock
+    return _ensure_sock()


 def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
     """Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s"""
-    sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
+    _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))


 def gradual_stop():
     """Smooth deceleration to zero over ~1 second."""
+    s = _ensure_sock()
     for _ in range(STOP_ITERATIONS):
-        send_vel(0.0, 0.0, 0.0)
+        s.send_string(json.dumps({"vel": {"vx": 0.0, "vy": 0.0, "vyaw": 0.0}}))
         time.sleep(STOP_DELAY)


 def send_cmd(cmd: str):
     """Send Holosoma state command: start | walk | stand | stop"""
-    sock.send_string(json.dumps({"cmd": cmd}))
+    _ensure_sock().send_string(json.dumps({"cmd": cmd}))


-# Load MOVE_MAP from navigation config
+# Load MOVE_MAP from navigation config (pure data, safe at import time)
 _nav = load_config("Navigation")
 MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}

Autonomous/marcus_autonomous.py
@@ -292,7 +292,10 @@ class AutonomousMode:
                self._enabled = False
                break

-           time.sleep(YOLO_CHECK_INTERVAL)
+           # No trailing sleep — _move_forward() takes FORWARD_DURATION,
+           # _turn() takes TURN_DURATION, and LLaVA assessment is ~1-2s.
+           # The body always consumes real wall time, so an extra sleep here
+           # would be pure dead time.

        # Clean up
        self._gradual_stop()

Brain/marcus_brain.py
@@ -17,7 +17,7 @@ PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)

-from API.zmq_api import send_vel, gradual_stop, send_cmd
+from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
 from API.camera_api import start_camera, stop_camera, get_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
@@ -70,7 +70,19 @@ _NAT_GOAL_RE = re.compile(
 # ══════════════════════════════════════════════════════════════════════════════

 def init_brain():
-    """Initialize all subsystems. Call once at startup."""
+    """Initialize all subsystems. Call once at startup from the parent process.
+
+    Optional subsystems (lidar / voice / imgsearch / autonomous) are gated on
+    `config_Brain.json::subsystems.<name>`. Disabling the ones you don't need
+    brings Marcus's boot time down from ~18 s to ~5-7 s.
+    """
+    subsys = _cfg.get("subsystems", {}) or {}
+
+    # Bind the ZMQ PUB socket before anything tries to publish on it.
+    # This is now explicit (previously it happened as an import side effect,
+    # which crashed every multiprocessing child that re-imported zmq_api).
+    init_zmq()
+
     raw_frame, raw_lock = start_camera()
     init_yolo(raw_frame, raw_lock)
@@ -79,53 +91,65 @@ def init_brain():

     init_memory()

-    # LiDAR (optional — continues without it)
-    try:
-        from API.lidar_api import init_lidar
-        init_lidar()
-    except Exception as e:
-        print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
+    # LiDAR — optional
+    if subsys.get("lidar", True):
+        try:
+            from API.lidar_api import init_lidar
+            init_lidar()
+        except Exception as e:
+            print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
+    else:
+        print(" [LiDAR] disabled by config")

-    init_imgsearch(
-        get_frame_fn=get_frame,
-        send_vel_fn=send_vel,
-        gradual_stop_fn=gradual_stop,
-        llava_fn=call_llava,
-        yolo_sees_fn=yolo_sees,
-        model=OLLAMA_MODEL,
-    )
+    # Image search — optional
+    if subsys.get("imgsearch", False):
+        init_imgsearch(
+            get_frame_fn=get_frame,
+            send_vel_fn=send_vel,
+            gradual_stop_fn=gradual_stop,
+            llava_fn=call_llava,
+            yolo_sees_fn=yolo_sees,
+            model=OLLAMA_MODEL,
+        )
+    else:
+        print(" [ImgSearch] disabled by config")

-    # Autonomous exploration mode
-    from API.memory_api import mem as _mem_ref
-    from API.llava_api import PATROL_PROMPT
-    auto = AutonomousMode(
-        get_frame_fn=get_frame,
-        send_vel_fn=send_vel,
-        gradual_stop_fn=gradual_stop,
-        yolo_sees_fn=yolo_sees,
-        yolo_summary_fn=yolo_summary,
-        yolo_all_classes_fn=yolo_all_classes,
-        yolo_closest_fn=yolo_closest,
-        odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},  # fallback if no odom
-        call_llava_fn=call_llava,
-        patrol_prompt=PATROL_PROMPT,
-        mem=_mem_ref,
-    )
-    # Wire odometry if available
-    from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
-    if _odom_ref and ODOM_AVAILABLE:
-        auto._odom_pos = lambda: {
-            "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
-        }
-    init_autonomous(auto)
+    # Autonomous exploration mode — optional
+    if subsys.get("autonomous", True):
+        from API.memory_api import mem as _mem_ref
+        from API.llava_api import PATROL_PROMPT
+        auto = AutonomousMode(
+            get_frame_fn=get_frame,
+            send_vel_fn=send_vel,
+            gradual_stop_fn=gradual_stop,
+            yolo_sees_fn=yolo_sees,
+            yolo_summary_fn=yolo_summary,
+            yolo_all_classes_fn=yolo_all_classes,
+            yolo_closest_fn=yolo_closest,
+            odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},
+            call_llava_fn=call_llava,
+            patrol_prompt=PATROL_PROMPT,
+            mem=_mem_ref,
+        )
+        from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
+        if _odom_ref and ODOM_AVAILABLE:
+            auto._odom_pos = lambda: {
+                "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
+            }
+        init_autonomous(auto)
+    else:
+        print(" [Autonomous] disabled by config")

     send_cmd("start")
     time.sleep(0.5)
     send_cmd("walk")
     time.sleep(0.5)

-    # Voice module (optional — continues without it)
-    _init_voice()
+    # Voice module — optional
+    if subsys.get("voice", True):
+        _init_voice()
+    else:
+        print(" [Voice] disabled by config")

     _log("Brain initialized", "info", "brain")
     _warmup_llava()
@@ -137,44 +161,37 @@ _voice_module = None


 def _init_voice():
-    """Initialize voice module — runs in background, calls process_command on speech."""
+    """
+    Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1
+    built-in TtsMaker for replies. Every transcribed command flows through
+    process_command(), and the resulting `speak` string is sent to the G1
+    speaker.
+    """
     global _audio_api, _voice_module
     try:
         from API.audio_api import AudioAPI
-        from Voice.marcus_gemini_voice import GeminiVoiceModule as VoiceModule
+        from Voice.marcus_voice import VoiceModule

         _audio_api = AudioAPI()

-        def _voice_callback(text, role):
-            """Gemini voice callback."""
-            pass  # handled below
-            if role != "user" or not text.strip():
-                return
-            t = text.strip().lower()
-            act_kw = ["turn","move","go","walk","step","stop","come","wave","clap",
-                      "high five","shake","hug","forward","backward","left","right",
-                      "what do you see","what can you see","look","describe","patrol",
-                      "دور","امشي","روح","تقدم","ارجع","وقف","قف","تعال",
-                      "يمين","يسار","قدام","ورا","لوح","صفق","سلم",
-                      "شو شايف","شو تشوف","ماذا ترى","شو قدامك","لف","خطوات"]
-            if any(kw in t for kw in act_kw):
-                print(f" [Brain] Action: {text.strip()}")
-                try:
-                    result = process_command(text.strip())
-                    if isinstance(result, dict):
-                        sp = result.get("speak", "")
-                        vis_kw = ["see","look","describe","شايف","تشوف","ترى","قدامك"]
-                        if any(k in t for k in vis_kw) and sp and _audio_api:
-                            print(f" [Brain] Vision: {sp}")
-                            _audio_api.speak(sp)
-                except Exception as e:
-                    print(f" [Brain] Error: {e}")
-            else:
-                print(f" [Chat] {text.strip()}")
+        def _on_command(text, lang):
+            text = (text or "").strip()
+            if not text:
+                return
+            print(f" [Voice] {text}")
+            try:
+                result = process_command(text)
+            except Exception as e:
+                print(f" [Brain] Error processing voice command: {e}")
+                return
+            if isinstance(result, dict):
+                sp = (result.get("speak") or "").strip()
+                if sp and _audio_api:
+                    _audio_api.speak(sp)

-        _voice_module = VoiceModule(_audio_api, on_transcript=_voice_callback)
+        _voice_module = VoiceModule(_audio_api, on_command=_on_command)
         _voice_module.start()
-        print(f" [Voice] Always listening (Gemini voice)")
+        print(" [Voice] Always listening (Whisper + G1 mic + TtsMaker)")
     except Exception as e:
         print(f" [Voice] Init failed: {e} — continuing without voice")
         _audio_api = None
@@ -255,7 +272,7 @@ def process_command(cmd: str) -> dict:

     # ── Greeting ─────────────────────────────────────────────────────────
     if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE):
-        response = "Hello! I am Marcus. How can I help you?"
+        response = "Hello! I am Sanad. How can I help you?"
         print(f"Marcus: {response}")
         add_to_history(cmd, response)
         log_cmd(cmd, response)
@@ -346,10 +363,15 @@ def _handle_llava(cmd):
     t0 = time.time()
     img = get_frame()

+    # Poll up to 500 ms in 50 ms slices instead of blocking a full second.
+    # Returns the moment a frame is available — most drops recover in <100 ms.
     if img is None:
         print(" Waiting for camera...")
-        time.sleep(1.0)
-        img = get_frame()
+        for _ in range(10):
+            time.sleep(0.05)
+            img = get_frame()
+            if img is not None:
+                break

     if img is None:
         print(" Camera not ready — command cancelled")
@@ -461,7 +483,7 @@ def run_terminal():
     status = get_brain_status()
     print()
     print("=" * 48)
-    print(" MARCUS AI BRAIN — READY")
+    print(" SANAD AI BRAIN — READY")
     print("=" * 48)
     for k, v in status.items():
         print(f" {k:<10}: {v}")

Config/config_Brain.json
@@ -3,13 +3,19 @@
   "max_history": 6,
   "num_batch": 128,
   "num_ctx": 2048,
-  "num_predict_main": 200,
+  "subsystems": {
+    "lidar": true,
+    "voice": true,
+    "imgsearch": false,
+    "autonomous": true
+  },
+  "num_predict_main": 120,
   "num_predict_goal": 80,
   "num_predict_patrol": 100,
   "num_predict_talk": 80,
   "num_predict_verify": 10,
   "warmup_num_predict": 5,
-  "main_prompt": "You are Marcus, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
-  "goal_prompt": "You are Marcus navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
-  "patrol_prompt": "You are Marcus, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
+  "main_prompt": "You are Sanad, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
+  "goal_prompt": "You are Sanad navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
+  "patrol_prompt": "You are Sanad, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
 }

Config/config_Voice.json
@@ -1,55 +1,46 @@
 {
   "tts": {
-    "piper_voice_ar": "ar_JO-kareem-medium",
-    "piper_voice_en": "en_US-lessac-medium",
-    "piper_sample_rate": 22050,
-    "target_sample_rate": 16000,
-    "piper_timeout_sec": 120,
-    "en_backend": "edge_tts",
-    "ar_backend": "edge_tts",
-    "edge_voice_ar": "ar-AE-HamdanNeural",
-    "edge_voice_en": "en-US-GuyNeural"
+    "backend": "builtin_ttsmaker",
+    "builtin_speaker_id": 0,
+    "target_sample_rate": 16000
   },
   "stt": {
     "wake_model": "tiny",
     "command_model": "small",
-    "wake_words_en": [
-      "marcus",
-      "marcos",
-      "markus"
-    ],
-    "wake_words_ar": [
-      "ماركوس",
-      "مارکوس",
-      "ماركس"
-    ],
-    "language": null,
+    "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
+    "language": "en",
     "command_timeout_sec": 10,
     "silence_threshold": 500,
     "silence_duration_sec": 1.5,
     "max_record_sec": 15
   },
   "mic": {
+    "backend": "builtin_udp",
     "source_index": "3",
     "format": "s16le",
     "rate": 16000,
     "channels": 1
   },
+  "mic_udp": {
+    "group": "239.168.123.161",
+    "port": 5555,
+    "buffer_max_bytes": 64000,
+    "read_timeout_sec": 0.04
+  },
   "speaker": {
     "dds_interface": "eth0",
     "volume": 100,
-    "app_name": "marcus"
+    "app_name": "sanad"
   },
   "audio": {
     "data_dir": "Data/Voice/Recordings",
     "log_file": "logs/voice.log"
   },
   "messages": {
-    "wake_heard": "Listening...",
-    "no_speech": "I didn't catch that. Say my name again.",
+    "wake_heard": "Listening",
+    "no_speech": "I didn't catch that, please say it again",
     "error_tts": "Speech synthesis failed",
     "error_mic": "Microphone error",
     "ready": "Voice system ready"
   }
 }
@@ -26,7 +26,7 @@
 # ══════════════════════════════════════════════════════════════════════════════

 main_prompt: |
-  You are Marcus — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
+  You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
   You are physically present in the room. You have a body, arms, and a camera.
   You follow commands from your operator and respond intelligently.
   {facts}
@@ -184,7 +184,7 @@ main_prompt: |
   → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}

   "who are you"
-  → {{"actions":[],"arm":null,"speak":"I am Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

   Safety:
   "walk into the wall"
@@ -307,7 +307,7 @@ main_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 goal_prompt: |
-  You are Marcus, a humanoid robot actively navigating toward a specific target.
+  You are Sanad, a humanoid robot actively navigating toward a specific target.

   YOUR MISSION: "{goal}"

@@ -392,7 +392,7 @@ goal_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 patrol_prompt: |
-  You are Marcus, a humanoid robot autonomously exploring and mapping an office environment.
+  You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.

   Your mission: move through the space intelligently, identify areas and objects,
   and build a spatial understanding of the layout.
@@ -463,7 +463,7 @@ patrol_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 talk_prompt: |
-  You are Marcus, a humanoid robot assistant. You have been asked a question
+  You are Sanad, a humanoid robot assistant. You have been asked a question
   or given information. Do NOT move — just respond intelligently.
   {facts}

@@ -509,7 +509,7 @@ talk_prompt: |
   → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}

   "what is your name"
-  → {{"actions":[],"arm":null,"speak":"My name is Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

   "who built you"
   → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}

Core/logger.py
@@ -1,9 +1,13 @@
 """
-logger.py — Project-wide logging via Logger.py
+logger.py — Project-wide configured logging instance.
+
+Imports the `Logs` backend class from log_backend.py (formerly Logger.py;
+renamed to avoid a case-only filename collision with this module, which
+breaks any case-insensitive filesystem — macOS default HFS+/APFS, Windows).
 """
 import os
 from Core.env_loader import PROJECT_ROOT
-from Core.Logger import Logs
+from Core.log_backend import Logs

 # Single shared instance — all modules use this
 _logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log"))
@@ -1,8 +1,37 @@
 # Marcus — Full API & Developer Reference

 **Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU
-**Scripts:** `~/Models_marcus/marcus_llava.py` + `~/Models_marcus/marcus_yolo.py`
-**Updated:** April 4, 2026
+**Robot persona:** Sanad (wake word + self-intro; project code stays under `Marcus/`)
+**Entry points:** `run_marcus.py` (terminal) / `Server/marcus_server.py` (WebSocket)
+**Updated:** 2026-04-21
+
+> **What changed since the early draft (April 4):** The project was restructured
+> from two monolithic scripts (`marcus_llava.py` + `marcus_yolo.py`) into a
+> layered architecture. See `Doc/architecture.md` for the current file tree and
+> `Doc/environment.md` for the verified Jetson software stack, exact library
+> versions, and GPU bring-up recipe. This reference still describes the
+> function-level semantics (inputs/outputs/examples) — treat any file path in
+> this document as illustrative and cross-check the actual module. Recent
+> deltas are called out inline below.
+
+### Recent API deltas (2026-04-21)
+
+| Change | Location | Note |
+|---|---|---|
+| GPU is mandatory for YOLO | `Config/config_Vision.json`, `Vision/marcus_yolo.py` | `yolo_device` defaults to `"cuda"` and is enforced; `_resolve_device()` raises `RuntimeError` on missing CUDA. `yolo_half=true` runs FP16 on Orin (capability 8.7). |
+| Ollama model | `Config/config_Brain.json` | Default `ollama_model` is `qwen2.5vl:3b` (not `llava:7b`). |
+| Ollama compute-graph caps | `Config/config_Brain.json` | `num_batch=128`, `num_ctx=2048` — required on 16 GB Orin NX to prevent the llama runner OOM. Propagated by `API/llava_api.py` and `Vision/marcus_imgsearch.py` to every `ollama.chat` call. |
+| `num_predict_main` lowered | `Config/config_Brain.json` | 200 → 120 (shaves ~400–600 ms per open-ended command; JSON still parses). |
+| ZMQ bind moved out of import | `API/zmq_api.py` | `init_zmq()` must be called from the main process before any `send_vel/send_cmd`. `init_brain()` does this. Children spawned via `multiprocessing` no longer collide on port 5556. |
+| Camera-retry poll | `Brain/marcus_brain.py::_handle_llava` | Replaced `time.sleep(1.0)` with 10×50 ms polls. |
+| Conditional scan sleeps | `Navigation/goal_nav.py`, `Autonomous/marcus_autonomous.py` | Removed unconditional per-step naps when real work (YOLO hit, LLaVA call, forward move) already consumed wall time. |
+| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
+| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
+| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
+| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
+| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
+| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
+| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
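Two of the rows above (the compute-graph caps and `num_predict_main`) only take effect if every Ollama call forwards them. A minimal sketch of that propagation, assuming the standard `ollama` Python client — the actual call sites in `API/llava_api.py` and `Vision/marcus_imgsearch.py` are not shown in this diff, so the constant and function names here are illustrative:

```python
import ollama

# Mirrors Config/config_Brain.json; hypothetical constant name.
GRAPH_CAPS = {"num_batch": 128, "num_ctx": 2048, "num_predict": 120}

def ask_vl(prompt: str, image_b64: str) -> str:
    resp = ollama.chat(
        model="qwen2.5vl:3b",
        messages=[{"role": "user", "content": prompt, "images": [image_b64]}],
        # Without these caps the llama runner OOMs on the 16 GB Orin NX.
        options=GRAPH_CAPS,
    )
    return resp["message"]["content"]
```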

 ---

@@ -22,38 +51,54 @@
 12. [JSON Schema Reference](#12-json-schema-reference)
 13. [Environment & Paths](#13-environment--paths)
 14. [Quick Reference Card](#14-quick-reference-card)
+15. [Voice API (mic + TTS + STT)](#15-voice-api-mic--tts--stt)

 ---

 ## 1. Configuration Variables

-Defined at the top of `marcus_llava.py`. Edit here to change global behavior.
+All configuration is now **JSON-driven** and lives under `Config/`. Each module
+loads its config at startup via `Core.config_loader.load_config(name)`.

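A short sketch of the loading pattern — assuming `load_config(name)` resolves to `Config/config_<name>.json` and returns a plain dict, which is consistent with `load_config("Navigation")["move_map"]` in `API/zmq_api.py`:

```python
from Core.config_loader import load_config

_cfg = load_config("Brain")           # assumed to read Config/config_Brain.json
model = _cfg["ollama_model"]          # "qwen2.5vl:3b"
subsys = _cfg.get("subsystems", {})   # missing block -> module defaults apply
```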
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `ZMQ_HOST` | `"127.0.0.1"` | Holosoma ZMQ host |
-| `ZMQ_PORT` | `5556` | Holosoma ZMQ port |
-| `ZMQ_YOLO_PORT` | `5557` | YOLO ZMQ port (standalone mode) |
-| `OLLAMA_MODEL` | `"llava:7b"` | LLaVA model via Ollama |
-| `CAM_WIDTH` | `424` | Camera capture width (px) |
-| `CAM_HEIGHT` | `240` | Camera capture height (px) |
-| `CAM_FPS` | `15` | Camera frame rate |
-| `CAM_QUALITY` | `70` | JPEG quality sent to LLaVA |
-| `STOP_ITERATIONS` | `20` | gradual_stop message count |
-| `STOP_DELAY` | `0.05` | seconds between stop messages |
-| `STEP_PAUSE` | `0.3` | pause between consecutive action steps |
-| `ARM_SDK_PATH` | `/home/unitree/unitree_sdk2_python` | Arm SDK path |
-| `ARM_INTERFACE` | `"eth0"` | Network interface for arm SDK |
-
-Defined at top of `marcus_yolo.py`:
+**`Config/config_ZMQ.json`** (Holosoma bridge)
+
+| Key | Default | Description |
+|---|---|---|
+| `zmq_host` | `"127.0.0.1"` | Holosoma ZMQ host |
+| `zmq_port` | `5556` | Holosoma ZMQ port |
+| `stop_iterations` | `20` | `gradual_stop()` message count |
+| `stop_delay` | `0.05` | seconds between stop messages |
+| `step_pause` | `0.3` | pause between consecutive action steps |

-| Variable | Default | Description |
-|----------|---------|-------------|
-| `YOLO_MODEL_PATH` | `.../Model/yolov8m.pt` | YOLO model path |
-| `YOLO_CONFIDENCE` | `0.45` | Minimum detection confidence |
-| `YOLO_IOU` | `0.45` | NMS IOU threshold |
-| `YOLO_DEVICE` | `"cpu"` | Inference device ("cpu" or "cuda") |
-| `YOLO_IMG_SIZE` | `320` | Inference image size (smaller = faster) |
+**`Config/config_Brain.json`** (Ollama VL model)
+
+| Key | Default | Description |
+|---|---|---|
+| `ollama_model` | `"qwen2.5vl:3b"` | Ollama model tag |
+| `max_history` | `6` | conversation turns retained |
+| `num_batch` | `128` | llama.cpp batch — **cap, required for Jetson** |
+| `num_ctx` | `2048` | llama.cpp KV context length — **cap, required for Jetson** |
+| `num_predict_main` | `120` | max tokens for the main command path |
+| `num_predict_goal` | `80` | goal-navigation call |
+| `num_predict_patrol` | `100` | autonomous patrol call |
+| `num_predict_talk` | `80` | talk-only path |
+| `num_predict_verify` | `10` | YOLO condition verifier (`yes`/`no`) |
+
+**`Config/config_Vision.json`** (YOLO)
+
+| Key | Default | Description |
+|---|---|---|
+| `yolo_model_path` | `"Models/yolov8m.pt"` | weights file (auto-fetched if missing) |
+| `yolo_confidence` | `0.45` | detection confidence threshold |
+| `yolo_iou` | `0.45` | NMS IOU threshold |
+| `yolo_device` | `"cuda"` | **GPU required** — `"cpu"` raises `RuntimeError` |
+| `yolo_half` | `true` | FP16 inference (Ampere tensor cores) |
+| `yolo_img_size` | `320` | inference image size |
+| `tracked_classes` | 19 COCO classes | filter for relevant detections |
+
+**`Config/config_Camera.json`**: `424x240 @ 15 fps`, `JPEG quality 70`.
+**`Config/config_Voice.json`**: see section 6 below.
+**`Config/config_Network.json`**: Jetson eth0/wlan0 IPs, WebSocket port.

 ---

@@ -61,20 +106,28 @@ Defined at top of `marcus_yolo.py`:

 ### Setup

+The bind is no longer an import-time side effect. It runs inside `init_zmq()`, called once by `init_brain()` from the main process. Children (e.g. the LiDAR SLAM worker spawned via `multiprocessing.spawn`) can re-import `API.zmq_api` without rebinding.
+
 ```python
-ctx = zmq.Context()
-sock = ctx.socket(zmq.PUB)
-sock.bind("tcp://127.0.0.1:5556")
-time.sleep(0.5)
+# API/zmq_api.py — bind happens here, not at module import
+def init_zmq() -> zmq.Socket:
+    global ctx, sock
+    if sock is not None:
+        return sock          # idempotent
+    ctx = zmq.Context()
+    sock = ctx.socket(zmq.PUB)
+    sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
+    time.sleep(0.5)          # let SUBs attach
+    return sock
 ```
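To make the call order concrete, a usage sketch — the names are the module's own, and the port value comes from `Config/config_ZMQ.json`:

```python
# Parent process only (run_marcus.py -> init_brain()):
from API.zmq_api import init_zmq, send_vel

init_zmq()          # binds tcp://127.0.0.1:5556 exactly once per process
send_vel(vx=0.2)    # safe now; raises RuntimeError if init_zmq() was skipped

# multiprocessing children may re-import API.zmq_api freely — the module
# no longer binds at import time, so no "Address already in use" crash.
```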
 ### `send_vel(vx, vy, vyaw)`

-Send velocity command to Holosoma.
+Send velocity command to Holosoma. Raises `RuntimeError` if `init_zmq()` wasn't called.

 ```python
 def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
-    sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
+    _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
 ```

 | Parameter | Unit | Safe range | Effect |
@@ -661,14 +714,17 @@ from unitree_sdk2py.g1.arm.g1_arm_action_client import G1ArmActionClient  # Arm

 ```
 STARTUP:
-  Tab 1:  source ~/.holosoma_deps/miniconda3/bin/activate hsinference
-          cd ~/holosoma && sudo jetson_clocks
+  Tab 1 (hsinference env): Holosoma locomotion policy
           python3 run_policy.py inference:g1-29dof-loco \
             --task.velocity-input zmq --task.state-input zmq --task.interface eth0

-  Tab 2:  ollama serve &
-          /home/unitree/miniconda3/envs/marcus/bin/python3 ~/Models_marcus/marcus_llava.py
-          (YOLO starts automatically — no Tab 3 needed)
+  Tab 2:  ollama serve > /tmp/ollama.log 2>&1 &
+          sleep 3
+
+  Tab 3 (marcus env): conda activate marcus && cd ~/Marcus && python3 run_marcus.py
+          (YOLO + voice + LiDAR all start automatically per subsystems flags)
+
+  WAKE WORD: "Sanad"

 COMMANDS:
   walk forward · turn right · turn left · move back
@@ -704,4 +760,74 @@ SAFETY:

 ---

+## 15. Voice API (mic + TTS + STT)
+
+New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
+
+### Mic — `Voice.builtin_mic.BuiltinMic`
+
+Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed.
+
+```python
+from Voice.builtin_mic import BuiltinMic
+mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000)
+mic.start()
+try:
+    pcm = mic.read_chunk(1024)    # 512 samples, ~32 ms, int16 mono
+    # or
+    pcm = mic.read_seconds(3.0)
+finally:
+    mic.stop()
+```
+
+Config under `config_Voice.json::mic_udp`.
+
+### TTS — `Voice.builtin_tts.BuiltinTTS`
+
+Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. English only — refuses non-ASCII input.
+
+```python
+from Voice.builtin_tts import BuiltinTTS
+tts = BuiltinTTS(audio_client, default_speaker_id=0)
+tts.speak("Hello, I am Sanad", block=True)    # synth + play on G1 body speaker
+```
+
+Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
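The non-ASCII refusal is presumably a plain `str.isascii()` gate ahead of the TtsMaker RPC — a hypothetical sketch, not the actual `Voice/builtin_tts.py` body:

```python
def _rejects(text: str) -> bool:
    # The G1 firmware silently maps Arabic to a Chinese voice, so anything
    # outside plain ASCII is refused instead of being mispronounced.
    return not text.isascii()
```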
+### Wake + command loop — `Voice.marcus_voice.VoiceModule`
+
+Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
+
+```python
+from API.audio_api import AudioAPI
+from Voice.marcus_voice import VoiceModule
+
+def on_command(text, lang):
+    print(f"heard: {text}")
+
+audio = AudioAPI()
+voice = VoiceModule(audio, on_command=on_command)
+voice.start()    # background thread
+# ... later ...
+voice.stop()
+```
+
+Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
+
+### AudioAPI — `API.audio_api.AudioAPI`
+
+Orchestration layer. Owns the `AudioClient`, manages mute/unmute, exposes a clean `speak` + `record` API.
+
+```python
+from API.audio_api import AudioAPI
+audio = AudioAPI()
+audio.speak("Hello")              # English only; non-ASCII returns early
+pcm = audio.record(seconds=5)     # int16 mono 16 kHz — uses BuiltinMic
+audio.play_pcm(pcm)               # raw PCM playback via Unitree RPC
+```
+
+Config: `config_Voice.json::tts.backend = "builtin_ttsmaker"`, `mic.backend = "builtin_udp"` (or `"pactl_parec"` to fall back to Hollyland).
+
 ---

 *Marcus — YS Lootah Technology | Kassam | April 2026*
Binary file not shown.
Binary file not shown.

Doc/architecture.md
@@ -1,20 +1,39 @@
 # Marcus — System Architecture

 **Project**: Marcus | YS Lootah Technology
-**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX (16GB)
-**Updated**: 2026-04-06
+**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
+**Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`)
+**Updated**: 2026-04-21

 ---

+## Recent deltas (since 2026-04-06)
+
+- **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default.
+- **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
+- **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command.
+- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
+- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
+- **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
+- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
+- **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
+- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
+- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo now clones cleanly on macOS/Windows.
+- **Robot persona = "Sanad"** — wake words, prompts, banner, and self-intro all use "Sanad". Project identity ("Marcus") remains in file names, class names, directory, logs.
+
See `Doc/environment.md` for the verified Jetson software stack and `Doc/pipeline.md` for the end-to-end data flow.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Marcus is a fully offline humanoid robot AI system. The brain runs on Jetson Orin NX with no cloud dependencies. It uses vision-language models (Qwen2.5-VL via Ollama) for understanding commands, YOLO for real-time object detection, dead reckoning for position tracking, and persistent memory across sessions.
|
||||
Marcus is a mostly-offline humanoid robot AI system. The brain runs on Jetson Orin NX using a local vision-language model (Qwen2.5-VL via Ollama) for open-ended commands, YOLOv8m for real-time object detection (CUDA + FP16), dead reckoning + optional ROS2 odometry for pose, Livox Mid-360 LiDAR + a custom SLAM worker for mapping, and persistent memory across sessions.
|
||||
|
||||
Two operating modes:
|
||||
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson
|
||||
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients
|
||||
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson. Voice subsystem runs alongside by default.
|
||||
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients.
|
||||
|
||||
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control.
|
||||
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. Voice, LiDAR, image-search and autonomous-patrol are gated behind `config_Brain.json::subsystems` flags.
|
||||
|
||||
---
|
||||
|
||||
@ -28,14 +47,14 @@ Marcus/
├── Core/ # Foundation layer — no external deps
│ ├── env_loader.py # Reads .env, resolves PROJECT_ROOT
│ ├── config_loader.py # load_config(name) → reads Config/config_{name}.json
│ ├── Logger.py # Logging engine (file-based, no console output)
│ ├── log_backend.py # Logging engine (file-based, no console output) — was Logger.py
│ └── logger.py # Project wrapper: log(), log_and_print(), get_logger()
│
├── Config/ # ALL configuration — one JSON per module
│ ├── config_ZMQ.json # ZMQ host, port, stop params
│ ├── config_Camera.json # RealSense resolution, fps, quality
│ ├── config_Brain.json # Ollama model, prompts, num_predict values
│ ├── config_Vision.json # YOLO model path, confidence, tracked classes
│ ├── config_Brain.json # Ollama model, prompts, num_predict, num_batch/ctx, subsystems
│ ├── config_Vision.json # YOLO model path, device=cuda, half=true, confidence, tracked classes
│ ├── config_Navigation.json # move_map, goal aliases, YOLO goal classes
│ ├── config_Patrol.json # patrol duration, proximity threshold
│ ├── config_Arm.json # arm actions, aliases, availability flag
@ -43,17 +62,26 @@ Marcus/
│ ├── config_Memory.json # session/places paths
│ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports
│ ├── config_ImageSearch.json # search defaults
│ └── marcus_prompts.yaml # All LLaVA/Qwen prompts (main, goal, patrol, talk, verify)
│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params
│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify)
│
├── API/ # Interface layer — one file per subsystem
│ ├── zmq_api.py # ZMQ PUB socket: send_vel(), gradual_stop(), send_cmd()
│ ├── zmq_api.py # ZMQ PUB socket: init_zmq(), send_vel(), gradual_stop(), send_cmd()
│ ├── camera_api.py # RealSense thread: start/stop_camera(), get_frame()
│ ├── llava_api.py # LLaVA queries: call_llava(), ask(), ask_goal(), ask_patrol()
│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()...
│ ├── llava_api.py # Qwen2.5-VL queries via Ollama: call_llava(), ask(), ask_goal()…
│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()…
│ ├── odometry_api.py # Odometry wrapper: init_odometry(), get_position()
│ ├── memory_api.py # Memory wrapper: init_memory(), log_cmd(), place_save/goto()
│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES
│ └── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher()
│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES (stub)
│ ├── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher()
│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
│
├── Voice/ # Mic + TTS + wake-word STT
│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555
│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id)
│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine
│
├── Brain/ # Decision logic — imports ONLY from API/
│ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal()
@ -127,39 +155,40 @@ Marcus/
│ Server/marcus_server.py (WebSocket) │
└──────────────────┬──────────────────────────────┘
│
┌──────────────────▼──────────────────────────────┐
│ Brain Layer │
│ marcus_brain.py — init_brain() │
│ — process_command(cmd) │
│ command_parser.py — 14 regex local commands │
│ executor.py — execute LLaVA decisions │
│ marcus_memory.py — session + place memory │
└──────────────────┬──────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ Brain Layer │
│ marcus_brain.py — init_brain() / process_command │
│ command_parser.py — regex-table local commands │
│ executor.py — execute Qwen-VL decisions │
│ marcus_memory.py — session + place memory │
└──────────────────┬──────────────────────────────────┘
│ imports only from API/
┌──────────────────▼──────────────────────────────┐
│ API Layer │
│ zmq_api camera_api llava_api │
│ yolo_api odometry_api memory_api │
│ arm_api imgsearch_api │
└──────────────────┬──────────────────────────────┘
│ wraps
┌──────────────────▼──────────────────────────────┐
│ Navigation / Vision │
│ goal_nav.py marcus_yolo.py │
│ patrol.py marcus_imgsearch.py │
│ marcus_odometry.py │
└──────────────────┬──────────────────────────────┘
│
┌──────────────────▼──────────────────────────────┐
│ Core Layer │
│ env_loader.py config_loader.py │
│ Logger.py logger.py │
└──────────────────┬──────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ API Layer │
│ zmq_api camera_api llava_api audio_api │
│ yolo_api odometry_api memory_api imgsearch_api │
│ arm_api lidar_api │
└──────────────┬───────────────────────┬──────────────┘
│ wraps │ wraps
┌──────────────▼───────────┐ ┌────────▼────────────────┐
│ Navigation / Vision │ │ Voice │
│ goal_nav.py │ │ builtin_mic.py │
│ patrol.py │ │ builtin_tts.py │
│ marcus_odometry.py │ │ marcus_voice.py │
│ marcus_yolo.py │ │ (Whisper + TtsMaker) │
│ marcus_imgsearch.py │ └──────────┬──────────────┘
└──────────────┬───────────┘ │
│ │
┌──────────────▼─────────────────────────▼────────────┐
│ Core Layer │
│ env_loader.py config_loader.py │
│ log_backend.py logger.py │
└──────────────────┬──────────────────────────────────┘
│ reads
┌──────────────────▼──────────────────────────────┐
│ Config / .env │
│ 11 JSON files + marcus_prompts.yaml │
└─────────────────────────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ Config / .env │
│ 13 JSON files + marcus_prompts.yaml │
└──────────────────────────────────────────────────────┘
```

**Rule**: Brain never imports from Vision/ or Navigation/ directly. It goes through the API layer.
@ -176,11 +205,11 @@ Reads `.env` from the project root to resolve `PROJECT_ROOT`. Uses a minimal bui
#### `config_loader.py` (30 lines)
`load_config(name)` reads `Config/config_{name}.json` and caches the result. All modules call this instead of hardcoding constants. Also provides `config_path(relative)` to resolve relative paths (e.g., `"Models/yolov8m.pt"`) to absolute paths from PROJECT_ROOT.
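A minimal sketch of the two helpers, assuming `PROJECT_ROOT` is exported by `Core/env_loader.py` (the real 30-line file may differ in details):

```python
import json
import os

from Core.env_loader import PROJECT_ROOT  # assumed export

_CACHE: dict = {}

def load_config(name: str) -> dict:
    """Read Config/config_{name}.json once and cache the parsed dict."""
    if name not in _CACHE:
        path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
        with open(path) as f:
            _CACHE[name] = json.load(f)
    return _CACHE[name]

def config_path(relative: str) -> str:
    """Resolve a repo-relative path like "Models/yolov8m.pt" to an absolute one."""
    return os.path.join(PROJECT_ROOT, relative)
```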

#### `Logger.py` (186 lines)
Full logging engine from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery.
#### `log_backend.py` (186 lines, was `Logger.py`)
Full logging engine ported from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery. Renamed from `Logger.py` on 2026-04-21 to eliminate a case-only collision with `logger.py` that prevented the repo from cloning on case-insensitive filesystems (macOS/Windows).

#### `logger.py` (51 lines)
Project wrapper around `Logger.py`. Provides:
Project wrapper around `log_backend.Logs`. Provides (usage example after the list):
- `log(message, level, module)` — write to `logs/{module}.log`
- `log_and_print(message, level, module)` — write + print
- `get_logger(module)` — get configured Logs instance
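```python
# Usage sketch, following the signatures listed above (keyword-style
# arguments are an assumption).
from Core.logger import log, log_and_print, get_logger

log("frame grabbed", level="INFO", module="camera")          # → logs/camera.log
log_and_print("YOLO ready", level="INFO", module="vision")   # log + console
logger = get_logger("brain")                                 # configured Logs instance
```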

@ -191,12 +220,15 @@ Project wrapper around `Logger.py`. Provides:

Each API file wraps one subsystem. They read their own config via `load_config()`, handle import errors gracefully with fallback stubs, and export clean public functions.
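The import-guard pattern the API files share looks roughly like this (illustrative names; each real module defines its own stubs):

```python
# Sketch: if the heavy dependency is missing, export a harmless stub so the
# brain still boots and the subsystem reports itself unavailable.
try:
    from ultralytics import YOLO
    YOLO_AVAILABLE = True
except ImportError:
    YOLO_AVAILABLE = False

    def yolo_sees(*args, **kwargs) -> bool:
        return False  # stub: vision quietly reports "nothing seen"
```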

#### `zmq_api.py` (49 lines)
Creates a ZMQ PUB socket on startup (binds to `tcp://127.0.0.1:{zmq_port}`). Holosoma's RL policy connects to this socket as SUB and receives velocity commands at 50 Hz.
#### `zmq_api.py` (~75 lines)
Holds the ZMQ PUB socket used to drive Holosoma at 50 Hz. **The bind is no longer a module import side effect** — it runs only when `init_zmq()` is called from the main (parent) process. This lets the LiDAR SLAM worker (spawned via `multiprocessing.spawn`) re-import the module without rebinding port 5556 and crashing. A sketch of the idempotent bind follows the export list.

**Exports:**
- `init_zmq()` — idempotent bind, called once by `init_brain()`
- `send_vel(vx, vy, vyaw)` — send velocity to Holosoma
- `gradual_stop()` — 20 zero-velocity messages over 1 second
- `send_cmd(cmd)` — Holosoma state machine (`start` / `walk` / `stand` / `stop`)
- `get_socket()` — access the bound socket (for odometry to reuse)
- `send_cmd(cmd)` — send state command: "start", "walk", "stand", "stop"
- `get_socket()` — return the shared PUB socket (for odometry to reuse)
- `MOVE_MAP` — direction-to-velocity lookup: `{"forward": (0.3, 0, 0), "left": (0, 0, 0.3), ...}`
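```python
# Sketch of the idempotent bind (illustrative; see API/zmq_api.py for the
# real body). Port 5556 and the PUB role come from config_ZMQ.json.
import zmq

_ctx = None
_sock = None

def init_zmq():
    global _ctx, _sock
    if _sock is not None:
        return _sock                     # already bound: safe to call twice
    _ctx = zmq.Context.instance()
    _sock = _ctx.socket(zmq.PUB)
    _sock.bind("tcp://127.0.0.1:5556")   # Holosoma connects as SUB
    return _sock
```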

---

### Voice/

Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).

#### `builtin_mic.py` (~180 lines, new 2026-04-21)
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM in a thread-safe ring buffer.

**Exports:**
- `BuiltinMic(group, port, buf_max, read_timeout)` — init
- `start()` / `stop()` — socket lifecycle (`start()` is idempotent)
- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
- `read_seconds(s)` — convenience for "record `s` seconds"
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)

#### `builtin_tts.py` (~70 lines, new 2026-04-21)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).

**Exports:**
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
- `speak(text, speaker_id=None, block=True)` — synth + play on G1 body speaker

#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`. A minimal wake-check sketch follows the export list.

**Exports:**
- `VoiceModule(audio_api, on_command=cb)` — init
- `start()` — spawn background thread
- `stop()` — graceful teardown
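```python
# Sketch of the IDLE-state wake check, assuming the openai-whisper package
# and numpy. The 2 s chunk size and wake-word list mirror
# config_Voice.json::stt; the real loop in marcus_voice.py adds state
# transitions and mic muting.
import numpy as np
import whisper

wake_model = whisper.load_model("tiny")
WAKE_WORDS = ("sanad", "sannad", "sanat", "sunnat")

def heard_wake_word(mic) -> bool:
    pcm = mic.read_seconds(2.0)  # int16 mono 16 kHz from BuiltinMic
    audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    text = wake_model.transcribe(audio, language="en", fp16=False)["text"]
    return any(w in text.lower() for w in WAKE_WORDS)
```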

---

### Server/

#### `marcus_server.py` (224 lines)

@ -1,15 +1,16 @@
# Marcus — Control & Startup Guide

**Updated**: 2026-04-06
**Robot persona:** Sanad (wake word + self-intro; project code lives under `Marcus/`)
**Updated**: 2026-04-21

---

## Quick Start

### Prerequisites (Jetson Orin NX)
### Prerequisites (Jetson Orin NX, JetPack 5.1.1)

```bash
# Terminal 1 — Start Holosoma (locomotion policy)
# Terminal 1 — Start Holosoma (locomotion policy, in hsinference env)
source ~/.holosoma_deps/miniconda3/bin/activate hsinference
cd ~/holosoma
~/.holosoma_deps/miniconda3/envs/hsinference/bin/python3 \
@ -19,28 +20,46 @@ cd ~/holosoma
  --task.velocity-input zmq \
  --task.state-input zmq \
  --task.interface eth0

# Terminal 2 — Ollama server (leave running)
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
ollama list   # confirm qwen2.5vl:3b present
```

### Option A — Terminal Mode (on Jetson)

```bash
# Terminal 2 — Start Marcus Brain
conda activate Marcus
ollama serve & sleep 3
# Terminal 3 — Start Marcus Brain
conda activate marcus
cd ~/Marcus
python3 run_marcus.py
```

Direct keyboard control. All commands typed locally.
Direct keyboard control + voice input (say **"Sanad"** to wake). Expected banner on boot:

```
================================================
  SANAD AI BRAIN — READY
================================================
  model    : qwen2.5vl:3b
  yolo     : True
  odometry : True
  memory   : True
  lidar    : True
  voice    : True
  camera   : 424x240@15
```

### Option B — Server + Client (remote)

```bash
# Terminal 2 (Jetson) — Start Server
# Terminal 3 (Jetson) — Start Server
conda activate marcus
cd ~/Marcus
python3 -m Server.marcus_server

# Terminal 3 (Workstation) — Connect Client
# Terminal 4 (Workstation) — Connect Client
cd ~/Robotics_workspace/yslootahtech/Project/Marcus
python3 -m Client.marcus_cli
```
@ -58,6 +77,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

---

## Voice

- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.

Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see the transcript on the console → Marcus answers through the speaker.

To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only, ~2 s faster.

---

## Command Reference

### Movement
@ -75,17 +108,17 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
### Vision
| Command | Action |
|---------|--------|
| `what do you see` | LLaVA describes camera view |
| `describe the room` | LLaVA scene description |
| `is anyone here` | LLaVA person check |
| `what do you see` | Qwen2.5-VL describes camera view |
| `describe the room` | Qwen2.5-VL scene description |
| `is anyone here` | Qwen2.5-VL person check |
| `yolo` | Show YOLO detection status |

### Goal Navigation
| Command | Action |
|---------|--------|
| `goal/ stop when you see a person` | YOLO fast search + stop |
| `goal/ find a laptop` | YOLO + LLaVA search |
| `goal/ stop when you see a guy holding a phone` | YOLO + LLaVA compound verification |
| `goal/ find a laptop` | YOLO + Qwen-VL search |
| `goal/ stop when you see a guy holding a phone` | YOLO + Qwen-VL compound verification |
| `find a person` | Auto-detected as goal (no prefix needed) |
| `look for a bottle` | Auto-detected as goal |

@ -106,7 +139,7 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `patrol` | Autonomous patrol (prompts for duration) |
| `patrol: door → desk → exit` | Named waypoint patrol |

### Image Search
### Image Search (requires `subsystems.imgsearch: true`)
| Command | Action |
|---------|--------|
| `search/ /path/to/photo.jpg` | Find target from reference image |
@ -122,11 +155,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `last session` | Previous session summary |
| `session summary` | Current session stats |

### Autonomous Mode
| Command | Action |
|---------|--------|
| `auto on` | Start autonomous exploration |
| `auto off` | Stop |
| `auto status` | Current step / observations |
| `auto save` | Snapshot observations to disk |

### System
| Command | Action |
|---------|--------|
| `help` | Command reference |
| `example` | Usage examples |
| `lidar` / `lidar status` | SLAM engine pose + health |
| `q` / `quit` | Shutdown |

### Client-Only Commands (CLI)

@ -139,35 +181,43 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

---

## Subsystem flags (`Config/config_Brain.json`)

Control what initializes at boot. Defaults:

```jsonc
"subsystems": {
  "lidar": true,
  "voice": true,
  "imgsearch": false,
  "autonomous": true
}
```

Set any to `false` to skip that subsystem's init (a sketch of the gating follows this list). Boot time drops roughly:
- `voice: false` → ~2 s faster (no Whisper model load)
- `lidar: false` → ~1 s faster (no SLAM subprocess spawn)
- `imgsearch: false` → already the default; re-enable only when you need `search/ …`
- `autonomous: false` → minor, but removes the AutonomousMode init
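```python
# Sketch of how init_brain() consumes the flags (illustrative; the real
# gating lives in Brain/marcus_brain.py and calls the init_* helpers
# from API/).
subs = load_config("Brain").get("subsystems", {})
if subs.get("lidar", True):
    init_lidar()
if subs.get("voice", True):
    _init_voice()
if subs.get("imgsearch", False):
    init_imgsearch()
if subs.get("autonomous", True):
    autonomous = AutonomousMode()
```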

---

## Network Configuration

| Interface | IP | Use |
|-----------|-----|------|
| `eth0` | 192.168.123.164 | Robot internal network (Jetson - G1 - LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson - Workstation) |
| `eth0` | 192.168.123.164 | Robot internal network (Jetson ↔ G1 ↔ LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson ↔ Workstation) |

| Service | Port | Protocol |
|---------|------|----------|
| Marcus WebSocket | 8765 | ws:// |
| ZMQ Velocity | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP |
| LiDAR | 192.168.123.120 | Livox Mid360 |
| ZMQ velocity (→ Holosoma) | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP (localhost only) |
| G1 audio multicast (mic) | 5555 | UDP multicast 239.168.123.161 |
| Livox Mid-360 (LiDAR) | 192.168.123.120 | UDP (Livox SDK) |

All configurable in `Config/config_Network.json`.

---

## Subsystem Status

On startup, the server/brain shows:
```
YOLO     : active (19 tracked classes, CPU, yolov8m.pt)
Odometry : active (dead reckoning, +/-10cm)
Memory   : active (session_016_2026-04-06)
Camera   : 424x240@15 (RealSense D435I)
LiDAR    : ALIVE (Livox Mid360 at 192.168.123.120)
Arms     : pending (GR00T N1.5 not yet integrated)
```
Most values configurable in `Config/config_Network.json` and `config_Voice.json::mic_udp`.

---

@ -175,13 +225,15 @@ Arms : pending (GR00T N1.5 not yet integrated)

| Issue | Cause | Fix |
|-------|-------|-----|
| `ModuleNotFoundError: No module named 'Server'` | Wrong directory | `cd ~/Marcus` then run |
| Robot doesn't move | Holosoma not running | Start Holosoma first (Terminal 1) |
| Robot doesn't move | ZMQ port conflict | Only run one of Server or Brain, not both |
| `Camera: {e} reconnecting` | USB bandwidth | Reduce to `low` profile |
| LLaVA slow (>10 s) | GPU VRAM full | Kill other GPU processes, or use `qwen2.5vl:3b` |
| `YOLO not available` | ultralytics not installed | `pip install ultralytics` |
| Client can't connect | Wrong IP or server not running | Check `status` command, verify IP |
| Banner shows `SANAD AI BRAIN — READY` but nothing moves | Holosoma not running | Start Holosoma (Terminal 1) first |
| `RuntimeError: CUDA not available` on boot | Wrong torch build on Jetson | See `Doc/environment.md` section 9.2 — reinstall the NVIDIA Jetson torch wheel |
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute-graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If it persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |

---

@ -191,6 +243,7 @@ Arms : pending (GR00T N1.5 not yet integrated)

|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
@ -199,3 +252,5 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Logs | `~/Marcus/logs/` |

See `Doc/architecture.md` for full project structure and file-by-file documentation.
See `Doc/environment.md` for the verified Jetson software stack.
See `Doc/pipeline.md` for the end-to-end data flow.

@ -1,10 +1,11 @@
# Marcus — Environment & Version Reference

**Project**: Marcus | YS Lootah Technology
**Robot persona**: Sanad (wake word + self-intro; codebase stays under `Marcus/`)
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Deployment host**: `unitree@192.168.123.164` (hostname `ubuntu`)
**Conda env**: `marcus`
**Captured**: 2026-04-12
**Captured**: 2026-04-12 (updated 2026-04-21)

This document is the canonical record of the verified GPU-accelerated software stack running on the Jetson Orin NX. It covers system software, Python environment, Marcus runtime dependencies, installation recipe, verification commands, and known quirks. Pair it with `architecture.md` (what the code does) and `controlling.md` (how to drive it).

@ -136,29 +137,23 @@ Captured from `importlib` on 2026-04-12, `marcus` env on the Jetson.

## 8. Marcus project modules — import status

All 16 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:
All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:

```
OK Core.config_loader
OK Core.env_loader
OK Vision.marcus_yolo
OK Vision.marcus_imgsearch
OK API.llava_api
OK API.yolo_api
OK API.camera_api
OK API.zmq_api
OK API.imgsearch_api
OK API.odometry_api
OK API.memory_api
OK API.arm_api
OK Navigation.goal_nav
OK Navigation.patrol
OK Navigation.marcus_odometry
OK Brain.marcus_brain
OK Brain.marcus_memory
OK Core.config_loader Core.env_loader
OK Core.log_backend Core.logger
OK Voice.builtin_mic Voice.builtin_tts Voice.marcus_voice
OK Vision.marcus_yolo Vision.marcus_imgsearch
OK API.llava_api API.yolo_api API.camera_api
OK API.zmq_api API.imgsearch_api API.odometry_api
OK API.memory_api API.arm_api API.audio_api
OK Navigation.goal_nav Navigation.patrol Navigation.marcus_odometry
OK Brain.marcus_brain Brain.marcus_memory Brain.command_parser
OK Autonomous.marcus_autonomous
```
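A quick way to reproduce the table above (a hypothetical one-liner, not a repo script; run from `~/Marcus` inside the `marcus` env):

```python
# Smoke-test imports for a representative subset of the 25 modules.
import importlib

MODULES = ["Core.config_loader", "Voice.builtin_mic", "Voice.builtin_tts",
           "API.audio_api", "Brain.marcus_brain"]
for m in MODULES:
    importlib.import_module(m)
    print("OK", m)
```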

Notable removals: `Voice/marcus_gemini_voice.py` deleted on 2026-04-21. `Core/Logger.py` renamed to `Core/log_backend.py`.

---

## 9. Installation recipe (reproducing this environment)
@ -378,3 +373,7 @@ Config file (`Config/config_Vision.json`):
| 2026-04-12 | Initial environment.md — full stack captured, GPU bring-up verified end to end. Steady-state YOLOv8m FPS on Orin NX measured at 21.9. Ollama Qwen2.5-VL verified at 100% GPU. |
| 2026-04-12 | `Vision/marcus_yolo.py` rewired to load `config_Vision.json`, added `_resolve_device()` with hard-fail on missing CUDA (GPU-only policy). `API/yolo_api.py` updated to propagate `RuntimeError`. `Config/config_Vision.json` set `yolo_device=cuda`, `yolo_half=true`. |
| 2026-04-12 | Installed NVIDIA Jetson torch `2.1.0a0+41361538.nv23.06` (replacing CPU-only PyPI `2.4.1`) + built torchvision `0.16.1` from source against it. Verified `nms device = cuda:0`. |
| 2026-04-12 | Fixed llama.cpp compute-graph OOM on Jetson: added `num_batch=128` + `num_ctx=2048` caps in `Config/config_Brain.json`, propagated through `API/llava_api.py` and `Vision/marcus_imgsearch.py`. Qwen2.5-VL compute graph drops from ~7.5 GiB to ~1.8 GiB. |
| 2026-04-21 | **Restructure**: moved ZMQ bind out of `API/zmq_api.py` import time into `init_zmq()`; fixes LiDAR SLAM worker spawn crash. Added loud GPU-requirement banner in `API/yolo_api.py`. Dropped `num_predict_main` 200→120. Made inner-loop sleeps in goal_nav/autonomous/imgsearch conditional. Renamed `Core/Logger.py` → `Core/log_backend.py` (case-collision fix). Updated `Doc/MARCUS_API.md` to current state. |
| 2026-04-21 | **Voice restructure**: added `Voice/builtin_mic.py` (G1 array mic via UDP multicast `239.168.123.161:5555`) and `Voice/builtin_tts.py` (thin `AudioClient.TtsMaker` wrapper). Rewired `Voice/marcus_voice.py` to use BuiltinMic. Refactored `API/audio_api.py::speak()` to use BuiltinTTS — removed ~110 lines of edge-tts + pydub + Piper plumbing. Deleted `Voice/marcus_gemini_voice.py`. Added the `subsystems.{lidar,voice,imgsearch,autonomous}` gate in `config_Brain.json`, consumed by `init_brain()`. |
| 2026-04-21 | **Persona swap**: robot identifies as Sanad. Wake words `["sanad","sannad","sanat","sunnat"]`, `speaker.app_name="sanad"`, all Qwen prompts say "You are Sanad", banner reads `SANAD AI BRAIN — READY`, hardcoded self-intro says "I am Sanad". Project directory, class names, filenames, and `PROJECT_NAME=Marcus` env var unchanged. |

15
Doc/note.txt
15
Doc/note.txt
@ -38,15 +38,12 @@ rm ~/Robotics_workspace/yslootahtech/Project/Marcus_fine_tune/marcus-gguf/marcus

https://ingrid789.github.io/SkillMimic/
https://github.com/wyhuai/SkillMimic

https://vla-survey.github.io/

https://github.com/AnjieCheng/NaVILA
https://rchalyang.github.io/EgoVLA/
https://github.com/RchalYang/EgoVLA_Release
https://github.com/openvla/openvla
https://github.com/unitreerobotics/unifolm-vla
https://github.com/OpenDriveLab/WholebodyVLA

187
Doc/pipeline.md
Normal file
187
Doc/pipeline.md
Normal file
@ -0,0 +1,187 @@
# Marcus — End-to-End Pipeline

**Robot persona:** Sanad (wake word + self-intro)
**Updated:** 2026-04-21

One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is) and `MARCUS_API.md` (function signatures).

---

## Boot sequence

`Brain/marcus_brain.py::init_brain()` — called once from `run_marcus.py` or `marcus_server.py`.

```
run_marcus.py
  │
  ▼
init_brain()
  │
  ├─ init_zmq()                      PUB bind tcp://127.0.0.1:5556 → Holosoma
  ├─ start_camera()                  RealSense 424×240@15fps → shared _raw_frame
  ├─ init_yolo(raw_frame, raw_lock)  YOLOv8m CUDA FP16, 19 classes — background thread
  ├─ init_odometry()                 ROS2 /dog_odom → dead reckoning fallback
  ├─ init_memory()                   loads Data/Brain/Sessions/session_NNN/
  │
  ├─ if subsystems.lidar:      init_lidar()      multiprocessing spawn SLAM_worker
  ├─ if subsystems.imgsearch:  init_imgsearch()  (off by default)
  ├─ if subsystems.autonomous: AutonomousMode()  patrol state machine
  │
  ├─ send_cmd("start") + 0.5s + send_cmd("walk") + 0.5s   Holosoma handshake
  │
  ├─ if subsystems.voice: _init_voice()   ▼ voice pipeline below
  └─ _warmup_llava()                      first Qwen2.5-VL inference
       "SANAD AI BRAIN — READY"
```

Subsystem flags live in `config_Brain.json::subsystems`. Current defaults:

```json
"subsystems": { "lidar": true, "voice": true, "imgsearch": false, "autonomous": true }
```

---

## Voice pipeline (when `subsystems.voice = true`)

```
G1 body mic (array)
  └─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM
      ▼
Voice/builtin_mic.py::BuiltinMic
  ring buffer (64 KB) + read_chunk(n)
      ▼
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
  ├─ IDLE       : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
  ├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
  ├─ PROCESSING : record-until-silence → Whisper small → transcribed text
  └─ on_command(text, "en")
      ▼
Brain/marcus_brain.py::process_command(text)
  ├─ regex fast-path → Brain/command_parser.py::try_local_command()
  │    places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
  └─ else → _handle_llava(text)
       ├─ get_frame()   (10×50 ms poll, no 1 s stall)
       ├─ API/llava_api.py::ask(text, img)
       │    ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
       │    → parse_json() → {actions, arm, speak, abort}
       └─ Brain/executor.py::execute(d)
            ├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
            ├─ arm → API/arm_api.py (stub for now)
            └─ abort → gradual_stop()
      ▼
result["speak"] → audio_api.speak(reply)
      ▼
API/audio_api.py::speak(text, lang="en")
  ├─ mute mic (flush BuiltinMic buffer)
  ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
  │    client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
  │    time.sleep(len(text) * 0.08)
  └─ unmute mic → back to IDLE
```

---

## Terminal / WebSocket command pipeline (same brain, skips voice)

```
run_marcus.py stdin   OR   Server/marcus_server.py WebSocket
      ▼
Brain/marcus_brain.py::process_command(text)
      ▼ (same parser → LLaVA → executor → ZMQ as above)
      ▼
result dict → stdout OR WebSocket reply frame
```

---

## Vision pipeline (continuous, consumed by brain on demand)

```
RealSense D435 (USB)
  └─ 424×240 BGR 15 fps
       → API/camera_api.py — shared _raw_frame (thread-safe)
            │
            └─ get_frame() → JPEG base64 on demand
      ▼
Vision/marcus_yolo.py (daemon thread)
  YOLOv8m @ cuda:0 FP16 imgsz=320
  → _latest_detections (thread-safe list)
  yolo_sees / yolo_closest / yolo_summary / yolo_fps
      ▼
Navigation/goal_nav.py (fast YOLO check → Qwen-VL fallback)
Autonomous/marcus_autonomous.py (patrol scan every N steps)
Brain/marcus_brain.py (status / alerts)
```

---

## Movement pipeline

```
Brain/executor.py OR Brain/command_parser.py OR Navigation/*
  │ uses MOVE_MAP from config_Navigation.json
  ▼
API/zmq_api.py::send_vel(vx, vy, vyaw)   JSON over ZMQ PUB (port 5556)
  ▼
Holosoma RL policy (separate process, hsinference env)
  ▼
G1 low-level joint commands over DDS/eth0
  ▼
29-DOF body motion
```

---

## LiDAR pipeline (when `subsystems.lidar = true`)

```
Livox Mid-360 (192.168.123.120, UDP)
  ▼
Lidar/SLAM_worker.py (multiprocessing.spawn subprocess — CUDA-safe spawn)
  ├─ SLAM_engine, SLAM_Filter, SLAM_LoopClosure, SLAM_Submap, SLAM_NavRuntime
  ├─ publishes pose + obstacle flags back to parent via Queue
  └─ writes occupancy grids to Data/Navigation/Maps/
  ▼
API/lidar_api.py (reads the queues, exposes:)
  ├─ obstacle_ahead() → bool
  ├─ get_lidar_status() → dict (pose, loc_state, frame age, FPS, ICP ms)
  └─ LIDAR_AVAILABLE
  ▼
Navigation/goal_nav.py rotation thread — pauses motion on obstacle_ahead()
Brain/command_parser.py — responds to "lidar status" queries
```

---

## Knobs that control each stage

| Knob | Location | Effect |
|---|---|---|
| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off |
| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off |
| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off |
| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off |
| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) |
| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply |
| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) |
| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) |
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |

---

## Per-command latency (estimated, post-fixes)

| Step | Typical | Notes |
|---|---|---|
| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk |
| Record until silence | 1–8 s | depends on user speech |
| Whisper small STT | 500–1500 ms | once per command |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot |

**Total wake → answer-playback:** ~**2.5–4 s** for a short vision question like "what do you see" (vs. 5–8 s with the pre-restructure edge-tts/Gemini overhead).
@ -123,26 +123,36 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
    reached = False
    try:
        for step in range(1, max_steps + 1):
            time.sleep(SCAN_INTERVAL)
            # Track whether real work happened this iteration. If it did,
            # the work itself already ate wall time — don't pay an extra
            # SCAN_INTERVAL nap on top.
            did_work = False

            # --- YOLO fast check ---
            if yolo_target and yolo_sees(yolo_target):
                img_b64 = get_frame()
                did_work = True
                if condition:
                    if not _verify_condition(yolo_target, condition, img_b64):
                        print(f" [GoalNav] YOLO sees {yolo_target} but condition "
                              f"'{condition}' not met — continuing")
                        continue

                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break
                    # fall through to the sleep-skip path
                else:
                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break

            # --- LLaVA fallback (less frequent — every few steps) ---
            if step >= MIN_STEPS and step % MIN_STEPS == 0:
                img_b64 = get_frame()
                if img_b64:
                    did_work = True
                    d = ask_goal(goal, img_b64)
                    if d.get("reached"):
                        print(f" [GoalNav] LLaVA says goal reached at step {step}")
@ -152,6 +162,11 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
                    if speak:
                        print(f" [GoalNav] LLaVA: {speak}")

            # Only pay the scan interval when nothing happened this step.
            # If YOLO hit or LLaVA fired, they already took 50–1000 ms.
            if not did_work:
                time.sleep(SCAN_INTERVAL)

    finally:
        rotating[0] = False
        rot_thread.join(timeout=1.0)
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
DEFAULT_MAX_STEPS = 60 # max rotation steps before giving up
|
||||
STEP_DELAY = 0.4 # seconds between YOLO checks
|
||||
STEP_DELAY = 0.15 # min gap between YOLO checks (was 0.4 — reduced
|
||||
# because the rotation thread paces motion already
|
||||
# and each LLaVA call is 600-1500 ms of real work)
|
||||
ROTATE_SPEED = 0.25 # rad/s rotation speed during search
|
||||
MIN_STEPS_WARMUP = 3 # skip first N steps (stale frame)
|
||||
MATCH_CONFIDENCE_THR = 0.6 # LLaVA confidence threshold (not used directly,
|
||||
|
||||

202
Voice/builtin_mic.py
Normal file
202
Voice/builtin_mic.py
Normal file
@ -0,0 +1,202 @@
"""
builtin_mic.py — G1 built-in microphone (UDP multicast capture)
================================================================
The G1 humanoid's on-board microphone is published by the Unitree firmware
as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying
16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network
can join the group and read the audio — no extra SDK call required.

This module intentionally has no dependency on pyaudio, pulseaudio, or the
unitree_sdk2py package. Joining the multicast group is all that's needed.

Usage:
    from Voice.builtin_mic import BuiltinMic
    mic = BuiltinMic()
    mic.start()
    try:
        chunk = mic.read_chunk(1024)   # 512 samples, 32 ms at 16 kHz
        ...
    finally:
        mic.stop()

Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation).
"""

from __future__ import annotations

import socket
import struct
import subprocess
import threading
import time
from typing import Optional


DEFAULT_GROUP = "239.168.123.161"
DEFAULT_PORT = 5555
DEFAULT_BUF_MAX = 64_000        # ~2 s of 16 kHz mono int16
DEFAULT_READ_TIMEOUT = 0.04     # 40 ms budget per read_chunk call
SAMPLE_RATE = 16_000            # hardware rate — do not change


def _find_g1_local_ip() -> str:
    """
    Return the host IPv4 on the G1's internal 192.168.123.0/24 network.
    Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on.
    """
    out = subprocess.run(
        ["ip", "-4", "-o", "addr"], capture_output=True, text=True,
    ).stdout
    for line in out.splitlines():
        for tok in line.split():
            if tok.startswith("192.168.123."):
                return tok.split("/")[0]
    raise RuntimeError(
        "BuiltinMic: no interface on 192.168.123.0/24 — "
        "host is not on the G1's internal network"
    )


class BuiltinMic:
    """
    G1 on-board microphone over UDP multicast.

    Thread-safe: a background daemon thread receives datagrams into an
    internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or
    blocks up to `read_timeout` before returning zeros.
    """

    sample_rate = SAMPLE_RATE

    def __init__(
        self,
        group: str = DEFAULT_GROUP,
        port: int = DEFAULT_PORT,
        buf_max: int = DEFAULT_BUF_MAX,
        read_timeout: float = DEFAULT_READ_TIMEOUT,
    ):
        self._group = group
        self._port = port
        self._buf_max = buf_max
        self._read_timeout = read_timeout
        self._sock: Optional[socket.socket] = None
        self._buf = bytearray()
        self._lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None

    def start(self) -> None:
        if self._running:
            return
        local_ip = _find_g1_local_ip()
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", self._port))
        mreq = struct.pack(
            "4s4s",
            socket.inet_aton(self._group),
            socket.inet_aton(local_ip),
        )
        self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
        self._sock.settimeout(1.0)
        self._running = True
        self._thread = threading.Thread(
            target=self._recv_loop, daemon=True, name="builtin_mic_rx",
        )
        self._thread.start()
        print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}")

    def _recv_loop(self) -> None:
        while self._running:
            try:
                data, _ = self._sock.recvfrom(4096)
                with self._lock:
                    self._buf.extend(data)
                    # ring-buffer: drop oldest when we'd exceed buf_max
                    if len(self._buf) > self._buf_max:
                        del self._buf[: len(self._buf) - self._buf_max]
            except socket.timeout:
                continue
            except Exception:
                if self._running:
                    time.sleep(0.01)

    def read_chunk(self, num_bytes: int) -> bytes:
        """
        Return exactly `num_bytes` of 16 kHz mono int16 PCM.

        Waits up to `read_timeout` for that many bytes to be available.
        If the buffer is still short after the timeout, returns whatever
        is available padded with silence. Never blocks forever.
        """
        deadline = time.time() + self._read_timeout
        while time.time() < deadline:
            with self._lock:
                if len(self._buf) >= num_bytes:
                    chunk = bytes(self._buf[:num_bytes])
                    del self._buf[:num_bytes]
                    return chunk
            time.sleep(0.003)
        with self._lock:
            avail = len(self._buf)
            if avail > 0:
                chunk = bytes(self._buf[:avail])
                del self._buf[:avail]
                return chunk + b"\x00" * (num_bytes - avail)
        return b"\x00" * num_bytes

    def read_seconds(self, seconds: float) -> bytes:
        """
        Convenience: capture `seconds` of audio and return as bytes.
        Blocks for the full duration (not a real-time producer).
        """
        num_bytes = int(seconds * self.sample_rate * 2)  # 2 bytes/sample (int16)
        out = bytearray()
        chunk_bytes = 1024
        while len(out) < num_bytes:
            out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out))))
        return bytes(out)

    def flush(self) -> None:
        """Drop all buffered audio (e.g. after the robot spoke)."""
        with self._lock:
            self._buf.clear()

    def stop(self) -> None:
        self._running = False
        if self._sock is not None:
            try:
                self._sock.close()
            except Exception:
                pass
            self._sock = None
        if self._thread is not None:
            self._thread.join(timeout=1.5)
            self._thread = None


# ────────────────────────────────────────────────────────────────
# Standalone test — capture 3 s and print energy stats
# ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import array

    print("BuiltinMic standalone test — capturing 3 s from G1...")
    mic = BuiltinMic()
    mic.start()
    time.sleep(0.3)  # let the receiver thread warm up
    raw = mic.read_seconds(3.0)
    mic.stop()

    samples = array.array("h", raw)
    if not samples:
        print(" FAIL — got zero samples")
    else:
        mn = min(samples); mx = max(samples)
        mean_abs = sum(abs(s) for s in samples) / len(samples)
        print(f" samples={len(samples)} min={mn} max={mx} mean|s|={mean_abs:.0f}")
        if mean_abs > 30:
            print(" OK — mic is capturing audio")
        else:
            print(" WARN — signal very low, check G1 audio service is running")

88
Voice/builtin_tts.py
Normal file
88
Voice/builtin_tts.py
Normal file
@ -0,0 +1,88 @@
"""
builtin_tts.py — Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.

Supported languages (firmware-side):
    English — works (Marcus uses this)
    Chinese — works (unused)
    Arabic  — silently falls back to Chinese (unusable — we refuse these)

Signature:
    client.TtsMaker(text: str, speaker_id: int) -> int   # 0 = success
    speaker_id ∈ {0, 1, 2} — different voice timbres

Usage:
    from Voice.builtin_tts import BuiltinTTS
    tts = BuiltinTTS(audio_client)
    tts.speak("Hello, I am Sanad", speaker_id=0)
"""

from __future__ import annotations

import logging
import time
from typing import Optional

log = logging.getLogger("builtin_tts")


class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine."""

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre)
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese. Returns the TtsMaker status
        code (0 = success) or -1 if input was rejected.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if any(ord(c) > 127 for c in text):
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        log.info("[TtsMaker sid=%d] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            log.error("TtsMaker call failed: %s", e)
            return -1

        if block:
            # Estimate how long the G1 is going to take to finish speaking.
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)

        return code
@ -1,608 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
|
||||
==================================================================
|
||||
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
|
||||
Uses G1 built-in speaker + Hollyland wireless mic.
|
||||
|
||||
Based on SanadVoice/gemini_interact architecture:
|
||||
- PyAudio for mic (not parec)
|
||||
- Echo suppression (silence when speaking)
|
||||
- Gemini VAD (automatic activity detection)
|
||||
- thinkingBudget=0 (no thinking text)
|
||||
- ASR buffering for full sentences
|
||||
- Vision routed to brain's Qwen camera
|
||||
|
||||
Usage:
|
||||
from Voice.marcus_gemini_voice import GeminiVoiceModule
|
||||
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
|
||||
voice.start()
|
||||
"""
|
||||
|
||||
import array
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
|
||||
PROJECT_NAME = "Marcus"
|
||||
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
|
||||
|
||||
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("gemini_voice")
|
||||
|
||||
|
||||
def load_config(name: str) -> dict:
|
||||
path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
|
||||
with open(path, "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────
|
||||
|
||||
API_KEY = "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8"
|
||||
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
|
||||
URI = (
|
||||
"wss://generativelanguage.googleapis.com/ws/"
|
||||
"google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
|
||||
f"?key={API_KEY}"
|
||||
)
|
||||
|
||||
VOICE_NAME = "Charon"
|
||||
SEND_RATE = 16000
|
||||
RECEIVE_RATE = 24000
|
||||
CHUNK_SIZE = 512
|
||||
CHANNELS = 1
|
||||
|
||||
|
||||
def load_system_prompt():
|
||||
paths = [
|
||||
os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
|
||||
]
|
||||
for p in paths:
|
||||
if os.path.exists(p):
|
||||
with open(p, "r", encoding="utf-8-sig") as f:
|
||||
return f.read().strip()
|
||||
return (
|
||||
"You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
|
||||
"Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
|
||||
)
|
||||
|
||||
|
||||
# ─── AUDIO HELPERS ────────────────────────────────────────
|
||||
|
||||
def audio_energy(pcm: bytes) -> int:
|
||||
try:
|
||||
samples = array.array("h", pcm)
|
||||
if not samples:
|
||||
return 0
|
||||
return sum(abs(s) for s in samples) // len(samples)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
|
||||
|
||||
|
||||
# ─── GEMINI VOICE MODULE ─────────────────────────────────

class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        self._mic_source = getattr(audio_api, '_mic_source',
                                   self._config["mic"].get("source_index", "0"))

        # State
        self.speaking = False
        self.interrupted = False
        self._running = False
        self._thread = None
        self._audio_queue = None  # created in async context

        # Tuning
        self.MIN_THRESHOLD = 3000            # floor for the barge-in energy threshold
        self.barge_in_threshold = self.MIN_THRESHOLD
        self.REQUIRED_LOUD_CHUNKS = 10       # consecutive loud chunks before barge-in fires
        self.PREBUFFER_CHUNKS = 2            # chunks to queue before playback starts
        self.PLAYBACK_TIMEOUT = 0.25         # seconds to wait for the next audio chunk
        self.BARGE_IN_COOLDOWN = 0.7         # seconds to ignore barge-in after one fires
        self.AI_SPEAK_GRACE = 0.20           # seconds after AI starts before barge-in is allowed
        self.ECHO_GUARD_SEC = 0.8            # ignore ASR input this long after AI audio
        self.SPEAKING_ENERGY_GATE = 0.85     # fraction of threshold gating the mic while speaking
        self.SEND_SILENCE_WHEN_SPEAKING = True

        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0

        # ASR buffer
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0  # start a new buffer if transcripts pause this long

        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()

        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────

    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()
        try:
            # First: unmute the PulseAudio source and raise its volume
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

            # Search for the wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
                    return i

            # Fall back to the 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i

            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────

    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        try:
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                             rate=mic_rate, input=True,
                             input_device_index=self._mic_device_idx,
                             frames_per_buffer=CHUNK_SIZE)
            values = []
            for _ in range(40):  # sample 40 chunks of ambient audio
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
        except Exception as e:
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────

    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24 kHz audio on the G1 speaker (resample to 16 kHz, single call)."""
        if len(pcm_24k) < 100:
            return

        # Resample 24 kHz → 16 kHz
        audio_16k = linear_resample(pcm_24k, RECEIVE_RATE, 16000)

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        client = self._audio._client
        if not client:
            return

        app_name = "gemini"
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
        time.sleep(0.1)

        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))

        # Block for the clip duration so `speaking` stays true during playback
        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────

    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()

        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1

        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                         rate=mic_rate, input=True,
                         input_device_index=self._mic_device_idx,
                         frames_per_buffer=CHUNK_SIZE)

        log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)

        loud_chunks = 0
        loop = asyncio.get_running_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1

        try:
            while self._running:
                data = await loop.run_in_executor(
                    None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))

                # Convert to mono 16 kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    if mic_channels == 2:  # stereo → mono
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    if mic_rate != SEND_RATE:
                        audio = linear_resample(audio, mic_rate, SEND_RATE)
                    data = audio.tobytes()

                energy = audio_energy(data)
                now = time.time()

                # Barge-in detection
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN

                # Echo suppression: send silence while speaking
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM

                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))

        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})

                # Server confirmed an interruption — clear the local flag
                if server_content.get("interrupted"):
                    self.interrupted = False

                # User transcription (partial/streaming); the field name has
                # varied across API revisions, so check all known spellings
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now

                if self.interrupted:
                    continue

                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire transcript callback
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)

                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")

                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""

            except Exception as e:
                log.error("Receive error: %s", e)

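    # For reference, the serverContent shape this handler reads — reconstructed
    # from the fields accessed above, not the full Live API schema:
    #   {"serverContent": {
    #       "inputTranscription": {"text": "..."},              # streaming user ASR
    #       "modelTurn": {"parts": [{"inlineData": {"data": "<b64 24 kHz PCM>"}},
    #                               {"text": "..."}]},
    #       "turnComplete": true,
    #       "interrupted": true}}
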
    async def _play_audio(self):
        """Collect Gemini audio chunks and play on the G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue

                # Pre-buffer
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)

                # Collect all audio chunks
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break

                # Play on G1 speaker
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)

                    await asyncio.get_running_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)

                self.speaking = False

            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────

    async def _run_async(self):
        import websockets
        import inspect

        system_prompt = load_system_prompt()

        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

        # Calibrate
        self._calibrate_mic()

        # websockets renamed extra_headers → additional_headers in newer
        # releases; probe the signature so both library versions work.
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}

        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    await ws.recv()  # wait for the setup acknowledgement
                    log.info("Connected! Always listening...")

                    self._audio_queue = asyncio.Queue()

                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )

            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                    await asyncio.sleep(3)

    def _voice_thread(self):
        # asyncio.run creates a fresh event loop in this background thread
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking


# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    import sys
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_transcript(text, role):
        print(f" [{role.upper()}] {text}")

    audio = AudioAPI()
    voice = GeminiVoiceModule(audio, on_transcript=on_transcript)

    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()

    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
@ -1,19 +1,20 @@
#!/usr/bin/env python3
"""
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
======================================================================
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================
State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
Wake word: "Marcus" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
TTS: Handled by API/audio_api.py
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Features.Voice.marcus_voice import VoiceModule
    from Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()  # background thread
    voice.stop()
@ -21,7 +22,6 @@ Usage:

import logging
import os
import subprocess
import threading
import time
import numpy as np
@ -74,7 +74,8 @@ class VoiceModule:
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command is transcribed
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
@ -83,13 +84,23 @@ class VoiceModule:
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded
        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None

        # Wake words
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000

        # State
        self._state = State.IDLE
@ -97,7 +108,7 @@ class VoiceModule:
        self._thread = None
        self._lock = threading.Lock()

        log.info("VoiceModule initialized")
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────

@ -115,69 +126,49 @@ class VoiceModule:
        self._cmd_model = whisper.load_model(self._stt["command_model"])
        log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────
    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record audio chunk from mic via parec."""
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        raw = proc.stdout.read()
        return np.frombuffer(raw, dtype=np.int16)
        """Capture a fixed-duration chunk from the G1 built-in mic."""
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        bite = 1024
        while len(raw) < num_bytes:
            raw.extend(self._mic_capture.read_chunk(min(bite, num_bytes - len(raw))))
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Record until silence is detected or max duration reached."""
        source = self._mic["source_index"]
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]
        """Capture until RMS drops below threshold for `silence_duration_sec`."""
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)

        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )

        all_audio = []
        all_audio = []
        silence_count = 0
        chunk_count = 0
        chunk_count = 0

        try:
            while chunk_count < max_chunks:
                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
                if not data:
                    break
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1

                chunk = np.frombuffer(data, dtype=np.int16)
                all_audio.append(chunk)
                chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0

                # Check for silence
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0

                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break

        if all_audio:
            return np.concatenate(all_audio)
@ -205,38 +196,18 @@ class VoiceModule:
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Check if transcribed text contains a wake word."""
        """Check if transcribed text contains an English wake word."""
        text_lower = text.lower().strip()

        # English wake words
        for w in self._wake_en:
            if w in text_lower:
                return True

        # Arabic wake words
        for w in self._wake_ar:
            if w in text:
                return True

        return False
        return any(w in text_lower for w in self._wake_en)

    # ─── MAIN LOOP ────────────────────────────────────────

    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")

        # Unmute mic once
        subprocess.run(
            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
            capture_output=True,
        )

        while self._running:
            try:
                if self._state == State.IDLE:
@ -279,9 +250,7 @@ class VoiceModule:
                    self._state = State.WAKE_HEARD

                    # Acknowledge
                    self._audio.speak(
                        self._config["messages"]["wake_heard"], "en"
                    )
                    self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
@ -294,7 +263,7 @@ class VoiceModule:

        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

@ -308,18 +277,16 @@ class VoiceModule:

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        # Detect language
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)
        log.info("Command: %s", text)

        # Send to brain callback
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, lang)
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)

@ -342,6 +309,10 @@ class VoiceModule:
    def stop(self):
        """Stop voice listening."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
        self._thread = None