Update 2026-04-21 16:10:00

This commit is contained in:
kassam 2026-04-21 16:10:03 +04:00
parent 8491be7f1e
commit e0f6acd5c7
24 changed files with 1291 additions and 1157 deletions

View File

@ -2,19 +2,24 @@
""" """
API/audio_api.py Marcus Audio API Layer API/audio_api.py Marcus Audio API Layer
========================================== ==========================================
Provides speak() and record_audio() for the Brain layer. Provides speak() and record() for the Brain layer.
Brain imports ONLY from this API, never from the unitree SDK directly. Brain imports ONLY from this API, never from the unitree SDK directly.
Speaker: _CallRequestWithParamAndBin (single call, full buffer)
Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config)
TTS EN: Unitree built-in TtsMaker
TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker
Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
no MP3/WAV plumbing, no internet). An optional raw-PCM playback path
via _play_pcm() is kept for future modules that synthesize their
own audio (e.g. offline Piper).
Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
The legacy Hollyland/parec path is retained as a fallback when
config_Voice.json has mic.backend="pactl_parec".
TTS: English only. Arabic is rejected (the G1 firmware silently maps
Arabic to Chinese, which confuses everyone). If Arabic TTS is ever
needed again, use a separate offline backend like Piper.
Usage: Usage:
from API.audio_api import AudioAPI from API.audio_api import AudioAPI
audio = AudioAPI() audio = AudioAPI()
audio.speak("Hello", "en") audio.speak("Hello, I am Sanad")
audio.speak("مرحبا", "ar")
recording = audio.record(seconds=5) recording = audio.record(seconds=5)
audio.play_pcm(recording) audio.play_pcm(recording)
""" """
@ -71,7 +76,24 @@ class AudioAPI:
self._tts = self._config["tts"] self._tts = self._config["tts"]
self._mic = self._config["mic"] self._mic = self._config["mic"]
self._spk = self._config["speaker"] self._spk = self._config["speaker"]
self._target_rate = self._tts["target_sample_rate"] self._target_rate = self._tts.get("target_sample_rate", 16000)
# Default mic backend: G1 built-in UDP multicast.
# Set mic.backend="pactl_parec" in config_Voice.json to fall back
# to the legacy Hollyland/PulseAudio path.
self._mic_backend = self._mic.get("backend", "builtin_udp")
self._builtin_mic = None # lazy-initialized on first record()
# Built-in TTS wrapper (uses the already-initialized AudioClient).
# Keeps TTS synchronous so `is_speaking` is meaningful to the voice
# loop that needs to skip mic input during playback.
self._tts_engine = None
if self._sdk_available:
from Voice.builtin_tts import BuiltinTTS
self._tts_engine = BuiltinTTS(
self._client,
default_speaker_id=self._tts.get("builtin_speaker_id", 0),
)
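The body of BuiltinTTS is not part of this hunk; below is a hypothetical sketch consistent with the call sites in this file (a blocking speak(), ASCII-only input, and a duration estimate using the same len(text) * 0.06 s heuristic the old edge-tts fallback used further down — that heuristic is an assumption about the real module):

```python
# Hypothetical sketch of Voice/builtin_tts.py; only the constructor
# signature is visible in this diff.
import time

class BuiltinTTS:
    def __init__(self, client, default_speaker_id: int = 0):
        self._client = client                  # unitree_sdk2py AudioClient
        self._speaker_id = default_speaker_id

    def speak(self, text: str, block: bool = True):
        # English only: the G1 firmware silently maps Arabic to Chinese,
        # so non-ASCII input is rejected rather than mispronounced.
        if not text.isascii():
            raise ValueError("BuiltinTTS is English-only; got non-ASCII text")
        self._client.TtsMaker(text, self._speaker_id)
        if block:
            # TtsMaker returns before playback finishes; sleep an estimated
            # duration (assumed heuristic) so callers can treat speak() as
            # synchronous and is_speaking stays meaningful.
            time.sleep(max(2.0, len(text) * 0.06))
```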
# Data dir # Data dir
data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"]) data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
@ -82,7 +104,10 @@ class AudioAPI:
self._speaking = False self._speaking = False
self._speak_lock = threading.Lock() self._speak_lock = threading.Lock()
log.info(self._config["messages"]["ready"]) log.info("%s (mic=%s, tts=%s)",
self._config["messages"]["ready"],
self._mic_backend,
"builtin_ttsmaker" if self._tts_engine else "disabled")
def _init_sdk(self): def _init_sdk(self):
"""Initialize Unitree AudioClient.""" """Initialize Unitree AudioClient."""
@ -105,55 +130,63 @@ class AudioAPI:
# ─── SPEAK ──────────────────────────────────────────── # ─── SPEAK ────────────────────────────────────────────
def speak(self, text: str, lang: str = "auto"): def speak(self, text: str, lang: str = "en"):
""" """
Speak text in the given language. Speak `text` in English through the G1 built-in TTS (TtsMaker).
Mutes mic during playback to prevent self-listening.
lang="en" built-in TtsMaker
lang="ar" Piper resample G1 speaker
lang="auto" detect from text
"""
if lang == "auto":
lang = self._detect_lang(text)
log.info("[%s] speak: %s", lang.upper(), text[:80]) Mutes (flushes) the mic during playback so the voice loop doesn't
hear the robot's own voice and transcribe itself. The `lang`
argument is accepted for API compatibility, but only "en" plays;
non-ASCII text (Arabic) is rejected by BuiltinTTS.
"""
if lang and lang not in ("en", "auto"):
log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
return
if self._tts_engine is None:
log.error("No TTS engine initialized — audio SDK unavailable")
return
log.info("speak: %s", text[:80])
with self._speak_lock: with self._speak_lock:
self._speaking = True self._speaking = True
self._mute_mic() self._mute_mic()
try: try:
if lang == "en": self._tts_engine.speak(text, block=True)
self._speak_english(text)
elif lang == "ar":
self._speak_arabic(text)
else:
log.warning("Unknown lang '%s', falling back to English", lang)
self._speak_english(text)
except Exception as e: except Exception as e:
log.error("%s: %s", self._config["messages"]["error_tts"], e) log.error("%s: %s", self._config["messages"]["error_tts"], e)
finally: finally:
# Small delay so speaker fully stops before mic reopens # Small tail so the speaker fully finishes before the mic is
time.sleep(0.3) # re-opened for capture
time.sleep(0.2)
self._unmute_mic() self._unmute_mic()
self._speaking = False self._speaking = False
def _mute_mic(self): def _mute_mic(self):
"""Mute the wireless mic to prevent self-listening.""" """
Suppress mic input during TTS playback.
For the UDP built-in mic, flush the buffer so we don't capture any
echo that's already been queued. For the legacy PulseAudio path,
actually mute the source.
"""
if self._mic_backend == "builtin_udp":
if self._builtin_mic is not None:
self._builtin_mic.flush()
return
source = self._mic["source_index"] source = self._mic["source_index"]
subprocess.run( subprocess.run(["pactl", "set-source-mute", source, "1"],
["pactl", "set-source-mute", source, "1"], capture_output=True)
capture_output=True,
)
log.debug("Mic muted") log.debug("Mic muted")
def _unmute_mic(self): def _unmute_mic(self):
"""Unmute the wireless mic.""" """Re-enable mic after TTS playback (pactl path only)."""
if self._mic_backend == "builtin_udp":
if self._builtin_mic is not None:
self._builtin_mic.flush()
return
source = self._mic["source_index"] source = self._mic["source_index"]
subprocess.run( subprocess.run(["pactl", "set-source-mute", source, "0"],
["pactl", "set-source-mute", source, "0"], capture_output=True)
capture_output=True,
)
log.debug("Mic unmuted") log.debug("Mic unmuted")
@property @property
@ -161,88 +194,8 @@ class AudioAPI:
"""True while TTS is playing — voice module checks this.""" """True while TTS is playing — voice module checks this."""
return self._speaking return self._speaking
def _speak_english(self, text: str):
"""English TTS via edge-tts."""
self._speak_edge_tts(text, "en")
def _speak_arabic(self, text: str):
"""Arabic TTS via edge-tts."""
self._speak_edge_tts(text, "ar")
def speak_piper_en(self, text: str):
"""Alternative: English via Piper instead of built-in."""
voice = self._tts["piper_voice_en"]
audio, rate = self._piper_synthesize(text, voice)
audio_16k = self._resample(audio, rate)
self._play_pcm(audio_16k)
# ─── PIPER TTS ────────────────────────────────────────
def _piper_synthesize(self, text: str, voice: str) -> tuple:
"""Run Piper CLI, return (audio_int16, sample_rate)."""
cmd = ["piper", "--model", voice, "--output_raw"]
timeout = self._tts["piper_timeout_sec"]
proc = subprocess.run(
cmd,
input=text.encode("utf-8"),
capture_output=True,
timeout=timeout,
)
if proc.returncode != 0:
stderr = proc.stderr.decode()[:300]
raise RuntimeError(f"Piper failed: {stderr}")
audio = np.frombuffer(proc.stdout, dtype=np.int16)
piper_rate = self._tts["piper_sample_rate"]
log.info("Piper: %d samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate)
return audio, piper_rate
# ─── RESAMPLE ─────────────────────────────────────────
def _speak_edge_tts(self, text: str, lang: str):
"""Generate speech via edge-tts and play on G1."""
import os as _os
voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
ts = int(time.time() * 1000)
mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
wav_path = f"/tmp/edge_{lang}_{ts}.wav"
safe_text = text.replace('"', '\\"')
code = f'import edge_tts, asyncio; asyncio.run(edge_tts.Communicate(\"{safe_text}\", voice=\"{voice}\").save(\"{mp3_path}\"))'
result = subprocess.run(["python3", "-c", code], capture_output=True, text=True, timeout=30)
if result.returncode != 0:
log.error("edge-tts failed: %s", result.stderr[:200])
if lang == "en" and self._sdk_available:
self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
time.sleep(max(2.0, len(text) * 0.06))
return
try:
from pydub import AudioSegment
a = AudioSegment.from_mp3(mp3_path)
a = a.set_frame_rate(16000).set_channels(1).set_sample_width(2)
a.export(wav_path, format="wav")
import wave
with wave.open(wav_path, "rb") as wf:
audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
_os.unlink(mp3_path)
_os.unlink(wav_path)
self._play_pcm(audio)
except Exception as e:
log.error("edge-tts conversion error: %s", e)
try: _os.unlink(mp3_path)
except: pass
try: _os.unlink(wav_path)
except: pass
def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray: def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
"""Resample to target rate (16kHz).""" """Linear resample int16 PCM to self._target_rate (16 kHz)."""
if src_rate == self._target_rate: if src_rate == self._target_rate:
return audio return audio
tl = int(len(audio) * self._target_rate / src_rate) tl = int(len(audio) * self._target_rate / src_rate)
@ -252,7 +205,7 @@ class AudioAPI:
audio.astype(np.float64), audio.astype(np.float64),
).astype(np.int16) ).astype(np.int16)
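Pieced together across the hunk boundary above, the resampler is plain linear interpolation via `np.interp`; an equivalent self-contained sketch (standalone function, not the exact method body):

```python
import numpy as np

def linear_resample(audio: np.ndarray, src_rate: int, dst_rate: int = 16000) -> np.ndarray:
    """Linearly resample mono int16 PCM from src_rate to dst_rate."""
    if src_rate == dst_rate:
        return audio
    n_out = int(len(audio) * dst_rate / src_rate)
    # Positions of the output samples expressed on the input sample grid.
    x_out = np.linspace(0, len(audio) - 1, n_out)
    x_in = np.arange(len(audio))
    return np.interp(x_out, x_in, audio.astype(np.float64)).astype(np.int16)
```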
# ─── G1 SPEAKER PLAYBACK ───────────────────────────── # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
def _play_pcm(self, audio_16k: np.ndarray) -> float: def _play_pcm(self, audio_16k: np.ndarray) -> float:
"""Play 16kHz mono int16 on G1 speaker. Returns duration.""" """Play 16kHz mono int16 on G1 speaker. Returns duration."""
@ -308,24 +261,50 @@ class AudioAPI:
# ─── MIC RECORDING ─────────────────────────────────── # ─── MIC RECORDING ───────────────────────────────────
def record(self, seconds: float = 5.0) -> np.ndarray: def record(self, seconds: float = 5.0) -> np.ndarray:
"""Record from Hollyland wireless mic via parec. Returns int16 array.""" """
Capture `seconds` of int16 mono 16 kHz PCM.
Default backend is the G1 built-in mic (UDP multicast). Set
mic.backend="pactl_parec" in config_Voice.json to use the
legacy Hollyland/parec path instead.
"""
if self._mic_backend == "builtin_udp":
return self._record_builtin(seconds)
return self._record_parec(seconds)
def _record_builtin(self, seconds: float) -> np.ndarray:
"""Built-in mic path — join UDP multicast, read the requested duration."""
if self._builtin_mic is None:
from Voice.builtin_mic import BuiltinMic
mcfg = self._config.get("mic_udp", {})
self._builtin_mic = BuiltinMic(
group=mcfg.get("group", "239.168.123.161"),
port=mcfg.get("port", 5555),
buf_max=mcfg.get("buffer_max_bytes", 64000),
)
self._builtin_mic.start()
time.sleep(0.2) # let the receiver thread fill in
log.info("Recording %.1fs from G1 built-in mic", seconds)
raw = self._builtin_mic.read_seconds(seconds)
audio = np.frombuffer(raw, dtype=np.int16)
log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
if audio.std() < 50:
log.warning(self._config["messages"]["error_mic"] +
" — G1 mic silent (check audio service on robot)")
return audio
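The sizing arithmetic behind read_seconds is worth making explicit: 16 kHz mono s16le is 32,000 bytes per second, so the default 64,000-byte buffer in config_Voice.json holds two seconds of audio. A sketch (the helper name is hypothetical):

```python
def pcm_bytes(seconds: float, rate: int = 16000, sample_width: int = 2) -> int:
    """Bytes of s16le mono PCM needed to hold `seconds` of audio."""
    return int(seconds * rate) * sample_width

assert pcm_bytes(1.0) == 32_000
assert pcm_bytes(2.0) == 64_000   # == mic_udp.buffer_max_bytes default
```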
def _record_parec(self, seconds: float) -> np.ndarray:
"""Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
source = self._mic["source_index"] source = self._mic["source_index"]
rate = str(self._mic["rate"]) rate = str(self._mic["rate"])
channels = str(self._mic["channels"]) channels = str(self._mic["channels"])
fmt = self._mic["format"] fmt = self._mic["format"]
# Unmute mic subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
subprocess.run( subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)
["pactl", "set-source-mute", source, "0"],
capture_output=True,
)
subprocess.run(
["pactl", "set-source-volume", source, "100%"],
capture_output=True,
)
log.info("Recording %.1fs from mic source %s", seconds, source)
log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
proc = subprocess.Popen( proc = subprocess.Popen(
["parec", "-d", source, ["parec", "-d", source,
f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"], f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
@ -337,10 +316,8 @@ class AudioAPI:
audio = np.frombuffer(raw, dtype=np.int16) audio = np.frombuffer(raw, dtype=np.int16)
log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std()) log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
if audio.std() < 50: if audio.std() < 50:
log.warning(self._config["messages"]["error_mic"] + " — mic may be silent") log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
return audio return audio
def save_recording(self, audio: np.ndarray, name: str) -> str: def save_recording(self, audio: np.ndarray, name: str) -> str:
@ -355,16 +332,6 @@ class AudioAPI:
log.info("Saved: %s", path) log.info("Saved: %s", path)
return path return path
# ─── LANGUAGE DETECTION ───────────────────────────────
@staticmethod
def _detect_lang(text: str) -> str:
"""Detect language from text — Arabic Unicode range check."""
for c in text:
if '\u0600' <= c <= '\u06FF':
return "ar"
return "en"
# ─── STATUS ─────────────────────────────────────────── # ─── STATUS ───────────────────────────────────────────
@property @property
@ -378,27 +345,16 @@ if __name__ == "__main__":
import argparse import argparse
parser = argparse.ArgumentParser(description="Marcus Audio API Test") parser = argparse.ArgumentParser(description="Marcus Audio API Test")
parser.add_argument("--test", action="store_true", help="Run speak tests") parser.add_argument("--test", action="store_true", help="Run TTS + record test")
parser.add_argument("--speak", type=str, help="Speak this text") parser.add_argument("--speak", type=str, help="Speak this English text")
parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
parser.add_argument("--record", type=float, default=0, help="Record N seconds") parser.add_argument("--record", type=float, default=0, help="Record N seconds")
args = parser.parse_args() args = parser.parse_args()
api = AudioAPI() api = AudioAPI()
if args.test: if args.test:
print("\n--- English built-in ---") print("\n--- English (TtsMaker) ---")
api.speak("Hello, I am Marcus.", "en") api.speak("Hello, I am Sanad.")
time.sleep(1)
print("\n--- Arabic Piper ---")
api.speak("مرحبا، أنا ماركوس", "ar")
time.sleep(1)
print("\n--- Auto-detect ---")
api.speak("How are you?")
time.sleep(1)
api.speak("كيف حالك؟")
time.sleep(1) time.sleep(1)
print("\n--- Record 3s + playback ---") print("\n--- Record 3s + playback ---")
@ -408,7 +364,7 @@ if __name__ == "__main__":
print("\nDone.") print("\nDone.")
elif args.speak: elif args.speak:
api.speak(args.speak, args.lang) api.speak(args.speak)
elif args.record > 0: elif args.record > 0:
rec = api.record(args.record) rec = api.record(args.record)

View File

@ -49,9 +49,28 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
print(f"marcus_yolo.py not found ({e})") print(f"marcus_yolo.py not found ({e})")
return False return False
# GPU is required — let RuntimeError from _resolve_device propagate so # GPU is required. _resolve_device() raises RuntimeError when CUDA is
# Marcus hard-fails at startup instead of silently running without vision. # missing — surface that with an actionable banner before re-raising so
ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock) # Marcus hard-fails with a clear error instead of a raw stack trace.
try:
ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
except RuntimeError as e:
print()
print("" + "" * 68 + "")
print("║ MARCUS STARTUP ABORTED — GPU REQUIRED".ljust(69) + "")
print("" + "" * 68 + "")
print(f"{str(e)[:66]:<66}")
print("" + " " * 68 + "")
print("║ On the Jetson, verify:".ljust(69) + "")
print("║ tegrastats # GPU exists & is not throttled".ljust(69) + "")
print("║ python3 -c 'import torch; print(torch.cuda.is_available())'".ljust(69) + "")
print("║ nvcc --version # CUDA toolkit reachable".ljust(69) + "")
print("║ Expected: torch 2.1.0 nv23.06, CUDA 11.4, GPU=Orin.".ljust(69) + "")
print("║ See Doc/environment.md section 9 for the reinstall recipe.".ljust(69) + "")
print("" + "" * 68 + "")
print()
raise
if ok: if ok:
YOLO_AVAILABLE = True YOLO_AVAILABLE = True
yolo_sees = _ys yolo_sees = _ys
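`_resolve_device()` itself lives in `Vision/marcus_yolo.py` and is not part of this diff; a hypothetical sketch of the gate it implements (names and messages are illustrative, not the real code):

```python
# Hypothetical sketch of the CUDA gate in Vision/marcus_yolo.py.
import torch

def _resolve_device(requested: str = "cuda") -> str:
    if requested != "cuda":
        raise RuntimeError(f"GPU is mandatory for YOLO; got device={requested!r}")
    if not torch.cuda.is_available():
        raise RuntimeError(
            "CUDA unavailable. Verify with tegrastats, "
            "python3 -c 'import torch; print(torch.cuda.is_available())', "
            "and nvcc --version (see Doc/environment.md section 9)."
        )
    return "cuda"
```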

View File

@ -1,7 +1,16 @@
""" """
zmq_api.py ZMQ velocity + command interface to Holosoma zmq_api.py ZMQ velocity + command interface to Holosoma
Previously the PUB socket was bound at module import time. That made the
module unsafe to re-import from any multiprocessing child (e.g. the LiDAR
SLAM_worker spawn), because the child would try to rebind the same port
and crash with `Address already in use`.
The bind now lives in init_zmq(); call it once from the brain entrypoint.
Child processes can import this module without any network side effects.
""" """
import json import json
import os
import time import time
import zmq import zmq
from Core.config_loader import load_config from Core.config_loader import load_config
@ -15,35 +24,62 @@ STOP_ITERATIONS = _cfg["stop_iterations"]
STOP_DELAY = _cfg["stop_delay"] STOP_DELAY = _cfg["stop_delay"]
STEP_PAUSE = _cfg["step_pause"] STEP_PAUSE = _cfg["step_pause"]
ctx = zmq.Context() # Shared state. These stay None until init_zmq() is called.
sock = ctx.socket(zmq.PUB) ctx: zmq.Context = None
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}") sock: zmq.Socket = None
time.sleep(0.5) _INIT_SETTLE = 0.5 # seconds to let PUB tell subscribers it's alive
log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT}", "info", "zmq")
def init_zmq() -> zmq.Socket:
"""
Bind the PUB socket. Idempotent; safe to call more than once.
Call from the main (parent) process only. Do NOT call from multiprocessing
children; they inherit nothing useful from the bound socket anyway.
"""
global ctx, sock
if sock is not None:
return sock
ctx = zmq.Context()
sock = ctx.socket(zmq.PUB)
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
time.sleep(_INIT_SETTLE)
log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT} (pid={os.getpid()})",
"info", "zmq")
return sock
def _ensure_sock() -> zmq.Socket:
if sock is None:
raise RuntimeError(
"zmq_api not initialized — call init_zmq() from the brain "
"entrypoint before using send_vel/send_cmd/gradual_stop"
)
return sock
def get_socket(): def get_socket():
"""Return the shared ZMQ PUB socket (for odometry to reuse).""" """Return the shared ZMQ PUB socket (for odometry to reuse)."""
return sock return _ensure_sock()
def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0): def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
"""Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s""" """Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s"""
sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
def gradual_stop(): def gradual_stop():
"""Smooth deceleration to zero over ~1 second.""" """Smooth deceleration to zero over ~1 second."""
s = _ensure_sock()
for _ in range(STOP_ITERATIONS): for _ in range(STOP_ITERATIONS):
send_vel(0.0, 0.0, 0.0) s.send_string(json.dumps({"vel": {"vx": 0.0, "vy": 0.0, "vyaw": 0.0}}))
time.sleep(STOP_DELAY) time.sleep(STOP_DELAY)
def send_cmd(cmd: str): def send_cmd(cmd: str):
"""Send Holosoma state command: start | walk | stand | stop""" """Send Holosoma state command: start | walk | stand | stop"""
sock.send_string(json.dumps({"cmd": cmd})) _ensure_sock().send_string(json.dumps({"cmd": cmd}))
# Load MOVE_MAP from navigation config # Load MOVE_MAP from navigation config (pure data, safe at import time)
_nav = load_config("Navigation") _nav = load_config("Navigation")
MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
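A usage sketch of the new contract (hypothetical call site; host/port values match this config):

```python
# Parent process (brain entrypoint): bind exactly once, then publish.
from API.zmq_api import init_zmq, send_cmd, send_vel, gradual_stop

init_zmq()            # binds tcp://127.0.0.1:5556; idempotent on re-call
send_cmd("start")
send_vel(vx=0.3)      # walk forward at 0.3 m/s
gradual_stop()

# A child process (e.g. the LiDAR SLAM worker) can now `import API.zmq_api`
# with no network side effect; calling send_vel there raises a clear
# RuntimeError instead of crashing with "Address already in use".
```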

View File

@ -292,7 +292,10 @@ class AutonomousMode:
self._enabled = False self._enabled = False
break break
time.sleep(YOLO_CHECK_INTERVAL) # No trailing sleep — _move_forward() takes FORWARD_DURATION,
# _turn() takes TURN_DURATION, and LLaVA assessment is ~1-2s.
# The body always consumes real wall time, so an extra sleep here
# would be pure dead time.
# Clean up # Clean up
self._gradual_stop() self._gradual_stop()

View File

@ -17,7 +17,7 @@ PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if PROJECT_DIR not in sys.path: if PROJECT_DIR not in sys.path:
sys.path.insert(0, PROJECT_DIR) sys.path.insert(0, PROJECT_DIR)
from API.zmq_api import send_vel, gradual_stop, send_cmd from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
from API.camera_api import start_camera, stop_camera, get_frame from API.camera_api import start_camera, stop_camera, get_frame
from API.yolo_api import ( from API.yolo_api import (
init_yolo, yolo_summary, yolo_fps, init_yolo, yolo_summary, yolo_fps,
@ -70,7 +70,19 @@ _NAT_GOAL_RE = re.compile(
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
def init_brain(): def init_brain():
"""Initialize all subsystems. Call once at startup.""" """Initialize all subsystems. Call once at startup from the parent process.
Optional subsystems (lidar / voice / imgsearch / autonomous) are gated on
`config_Brain.json::subsystems.<name>`. Disabling the ones you don't need
brings Marcus's boot time down from ~18 s to ~5-7 s.
"""
subsys = _cfg.get("subsystems", {}) or {}
# Bind the ZMQ PUB socket before anything tries to publish on it.
# This is now explicit (previously it happened as an import side effect,
# which crashed every multiprocessing child that re-imported zmq_api).
init_zmq()
raw_frame, raw_lock = start_camera() raw_frame, raw_lock = start_camera()
init_yolo(raw_frame, raw_lock) init_yolo(raw_frame, raw_lock)
@ -79,53 +91,65 @@ def init_brain():
init_memory() init_memory()
# LiDAR (optional — continues without it) # LiDAR — optional
try: if subsys.get("lidar", True):
from API.lidar_api import init_lidar try:
init_lidar() from API.lidar_api import init_lidar
except Exception as e: init_lidar()
print(f" [LiDAR] Init failed: {e} — continuing without LiDAR") except Exception as e:
print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
else:
print(" [LiDAR] disabled by config")
init_imgsearch( # Image search — optional
get_frame_fn=get_frame, if subsys.get("imgsearch", False):
send_vel_fn=send_vel, init_imgsearch(
gradual_stop_fn=gradual_stop, get_frame_fn=get_frame,
llava_fn=call_llava, send_vel_fn=send_vel,
yolo_sees_fn=yolo_sees, gradual_stop_fn=gradual_stop,
model=OLLAMA_MODEL, llava_fn=call_llava,
) yolo_sees_fn=yolo_sees,
model=OLLAMA_MODEL,
)
else:
print(" [ImgSearch] disabled by config")
# Autonomous exploration mode # Autonomous exploration mode — optional
from API.memory_api import mem as _mem_ref if subsys.get("autonomous", True):
from API.llava_api import PATROL_PROMPT from API.memory_api import mem as _mem_ref
auto = AutonomousMode( from API.llava_api import PATROL_PROMPT
get_frame_fn=get_frame, auto = AutonomousMode(
send_vel_fn=send_vel, get_frame_fn=get_frame,
gradual_stop_fn=gradual_stop, send_vel_fn=send_vel,
yolo_sees_fn=yolo_sees, gradual_stop_fn=gradual_stop,
yolo_summary_fn=yolo_summary, yolo_sees_fn=yolo_sees,
yolo_all_classes_fn=yolo_all_classes, yolo_summary_fn=yolo_summary,
yolo_closest_fn=yolo_closest, yolo_all_classes_fn=yolo_all_classes,
odom_fn=lambda: {"x": 0, "y": 0, "heading": 0}, # fallback if no odom yolo_closest_fn=yolo_closest,
call_llava_fn=call_llava, odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},
patrol_prompt=PATROL_PROMPT, call_llava_fn=call_llava,
mem=_mem_ref, patrol_prompt=PATROL_PROMPT,
) mem=_mem_ref,
# Wire odometry if available )
from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
if _odom_ref and ODOM_AVAILABLE: if _odom_ref and ODOM_AVAILABLE:
auto._odom_pos = lambda: { auto._odom_pos = lambda: {
"x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
} }
init_autonomous(auto) init_autonomous(auto)
else:
print(" [Autonomous] disabled by config")
send_cmd("start") send_cmd("start")
time.sleep(0.5) time.sleep(0.5)
send_cmd("walk") send_cmd("walk")
time.sleep(0.5) time.sleep(0.5)
# Voice module (optional — continues without it) # Voice module — optional
_init_voice() if subsys.get("voice", True):
_init_voice()
else:
print(" [Voice] disabled by config")
_log("Brain initialized", "info", "brain") _log("Brain initialized", "info", "brain")
_warmup_llava() _warmup_llava()
@ -137,44 +161,37 @@ _voice_module = None
def _init_voice(): def _init_voice():
"""Initialize voice module — runs in background, calls process_command on speech.""" """
Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1
built-in TtsMaker for replies. Every transcribed command flows through
process_command(), and the resulting `speak` string is sent to the G1
speaker.
"""
global _audio_api, _voice_module global _audio_api, _voice_module
try: try:
from API.audio_api import AudioAPI from API.audio_api import AudioAPI
from Voice.marcus_gemini_voice import GeminiVoiceModule as VoiceModule from Voice.marcus_voice import VoiceModule
_audio_api = AudioAPI() _audio_api = AudioAPI()
def _voice_callback(text, role): def _on_command(text, lang):
"""Gemini voice callback.""" text = (text or "").strip()
pass # handled below if not text:
if role != "user" or not text.strip():
return return
t = text.strip().lower() print(f" [Voice] {text}")
act_kw = ["turn","move","go","walk","step","stop","come","wave","clap", try:
"high five","shake","hug","forward","backward","left","right", result = process_command(text)
"what do you see","what can you see","look","describe","patrol", except Exception as e:
"دور","امشي","روح","تقدم","ارجع","وقف","قف","تعال", print(f" [Brain] Error processing voice command: {e}")
"يمين","يسار","قدام","ورا","لوح","صفق","سلم", return
"شو شايف","شو تشوف","ماذا ترى","شو قدامك","لف","خطوات"] if isinstance(result, dict):
if any(kw in t for kw in act_kw): sp = (result.get("speak") or "").strip()
print(f" [Brain] Action: {text.strip()}") if sp and _audio_api:
try: _audio_api.speak(sp)
result = process_command(text.strip())
if isinstance(result, dict):
sp = result.get("speak", "")
vis_kw = ["see","look","describe","شايف","تشوف","ترى","قدامك"]
if any(k in t for k in vis_kw) and sp and _audio_api:
print(f" [Brain] Vision: {sp}")
_audio_api.speak(sp)
except Exception as e:
print(f" [Brain] Error: {e}")
else:
print(f" [Chat] {text.strip()}")
_voice_module = VoiceModule(_audio_api, on_transcript=_voice_callback) _voice_module = VoiceModule(_audio_api, on_command=_on_command)
_voice_module.start() _voice_module.start()
print(f" [Voice] Always listening (Gemini voice)") print(" [Voice] Always listening (Whisper + G1 mic + TtsMaker)")
except Exception as e: except Exception as e:
print(f" [Voice] Init failed: {e} — continuing without voice") print(f" [Voice] Init failed: {e} — continuing without voice")
_audio_api = None _audio_api = None
@ -255,7 +272,7 @@ def process_command(cmd: str) -> dict:
# ── Greeting ───────────────────────────────────────────────────────── # ── Greeting ─────────────────────────────────────────────────────────
if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE): if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE):
response = "Hello! I am Marcus. How can I help you?" response = "Hello! I am Sanad. How can I help you?"
print(f"Marcus: {response}") print(f"Marcus: {response}")
add_to_history(cmd, response) add_to_history(cmd, response)
log_cmd(cmd, response) log_cmd(cmd, response)
@ -346,10 +363,15 @@ def _handle_llava(cmd):
t0 = time.time() t0 = time.time()
img = get_frame() img = get_frame()
# Poll up to 500 ms in 50 ms slices instead of blocking a full second.
# Returns the moment a frame is available — most drops recover in <100 ms.
if img is None: if img is None:
print(" Waiting for camera...") print(" Waiting for camera...")
time.sleep(1.0) for _ in range(10):
img = get_frame() time.sleep(0.05)
img = get_frame()
if img is not None:
break
if img is None: if img is None:
print(" Camera not ready — command cancelled") print(" Camera not ready — command cancelled")
@ -461,7 +483,7 @@ def run_terminal():
status = get_brain_status() status = get_brain_status()
print() print()
print("=" * 48) print("=" * 48)
print(" MARCUS AI BRAIN — READY") print(" SANAD AI BRAIN — READY")
print("=" * 48) print("=" * 48)
for k, v in status.items(): for k, v in status.items():
print(f" {k:<10}: {v}") print(f" {k:<10}: {v}")

View File

@ -3,13 +3,19 @@
"max_history": 6, "max_history": 6,
"num_batch": 128, "num_batch": 128,
"num_ctx": 2048, "num_ctx": 2048,
"num_predict_main": 200, "subsystems": {
"lidar": true,
"voice": true,
"imgsearch": false,
"autonomous": true
},
"num_predict_main": 120,
"num_predict_goal": 80, "num_predict_goal": 80,
"num_predict_patrol": 100, "num_predict_patrol": 100,
"num_predict_talk": 80, "num_predict_talk": 80,
"num_predict_verify": 10, "num_predict_verify": 10,
"warmup_num_predict": 5, "warmup_num_predict": 5,
"main_prompt": "You are Marcus, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:", "main_prompt": "You are Sanad, a humanoid robot. 
Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
"goal_prompt": "You are Marcus navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:", "goal_prompt": "You are Sanad navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
"patrol_prompt": "You are Marcus, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:" "patrol_prompt": "You are Sanad, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
} }

View File

@ -1,53 +1,44 @@
{ {
"tts": { "tts": {
"piper_voice_ar": "ar_JO-kareem-medium", "backend": "builtin_ttsmaker",
"piper_voice_en": "en_US-lessac-medium",
"piper_sample_rate": 22050,
"builtin_speaker_id": 0, "builtin_speaker_id": 0,
"target_sample_rate": 16000, "target_sample_rate": 16000
"piper_timeout_sec": 120,
"en_backend": "edge_tts",
"ar_backend": "edge_tts",
"edge_voice_ar": "ar-AE-HamdanNeural",
"edge_voice_en": "en-US-GuyNeural"
}, },
"stt": { "stt": {
"wake_model": "tiny", "wake_model": "tiny",
"command_model": "small", "command_model": "small",
"wake_words_en": [ "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
"marcus", "language": "en",
"marcos",
"markus"
],
"wake_words_ar": [
"ماركوس",
"مارکوس",
"ماركس"
],
"language": null,
"command_timeout_sec": 10, "command_timeout_sec": 10,
"silence_threshold": 500, "silence_threshold": 500,
"silence_duration_sec": 1.5, "silence_duration_sec": 1.5,
"max_record_sec": 15 "max_record_sec": 15
}, },
"mic": { "mic": {
"backend": "builtin_udp",
"source_index": "3", "source_index": "3",
"format": "s16le", "format": "s16le",
"rate": 16000, "rate": 16000,
"channels": 1 "channels": 1
}, },
"mic_udp": {
"group": "239.168.123.161",
"port": 5555,
"buffer_max_bytes": 64000,
"read_timeout_sec": 0.04
},
"speaker": { "speaker": {
"dds_interface": "eth0", "dds_interface": "eth0",
"volume": 100, "volume": 100,
"app_name": "marcus" "app_name": "sanad"
}, },
"audio": { "audio": {
"data_dir": "Data/Voice/Recordings", "data_dir": "Data/Voice/Recordings",
"log_file": "logs/voice.log" "log_file": "logs/voice.log"
}, },
"messages": { "messages": {
"wake_heard": "Listening...", "wake_heard": "Listening",
"no_speech": "I didn't catch that. Say my name again.", "no_speech": "I didn't catch that, please say it again",
"error_tts": "Speech synthesis failed", "error_tts": "Speech synthesis failed",
"error_mic": "Microphone error", "error_mic": "Microphone error",
"ready": "Voice system ready" "ready": "Voice system ready"

View File

@ -26,7 +26,7 @@
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
main_prompt: | main_prompt: |
You are Marcus — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai. You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
You are physically present in the room. You have a body, arms, and a camera. You are physically present in the room. You have a body, arms, and a camera.
You follow commands from your operator and respond intelligently. You follow commands from your operator and respond intelligently.
{facts} {facts}
@ -184,7 +184,7 @@ main_prompt: |
→ {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}} → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}
"who are you" "who are you"
→ {{"actions":[],"arm":null,"speak":"I am Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}} → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
Safety: Safety:
"walk into the wall" "walk into the wall"
@ -307,7 +307,7 @@ main_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
goal_prompt: | goal_prompt: |
You are Marcus, a humanoid robot actively navigating toward a specific target. You are Sanad, a humanoid robot actively navigating toward a specific target.
YOUR MISSION: "{goal}" YOUR MISSION: "{goal}"
@ -392,7 +392,7 @@ goal_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
patrol_prompt: | patrol_prompt: |
You are Marcus, a humanoid robot autonomously exploring and mapping an office environment. You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.
Your mission: move through the space intelligently, identify areas and objects, Your mission: move through the space intelligently, identify areas and objects,
and build a spatial understanding of the layout. and build a spatial understanding of the layout.
@ -463,7 +463,7 @@ patrol_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
talk_prompt: | talk_prompt: |
You are Marcus, a humanoid robot assistant. You have been asked a question You are Sanad, a humanoid robot assistant. You have been asked a question
or given information. Do NOT move — just respond intelligently. or given information. Do NOT move — just respond intelligently.
{facts} {facts}
@ -509,7 +509,7 @@ talk_prompt: |
→ {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}} → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}
"what is your name" "what is your name"
→ {{"actions":[],"arm":null,"speak":"My name is Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}} → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
"who built you" "who built you"
→ {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}} → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}

View File

@ -1,9 +1,13 @@
""" """
logger.py Project-wide logging via Logger.py logger.py Project-wide configured logging instance.
Imports the `Logs` backend class from log_backend.py (formerly Logger.py;
renamed to avoid a case-only filename collision with this module, which
breaks checkouts on any case-insensitive filesystem: macOS's default APFS/HFS+, Windows).
""" """
import os import os
from Core.env_loader import PROJECT_ROOT from Core.env_loader import PROJECT_ROOT
from Core.Logger import Logs from Core.log_backend import Logs
# Single shared instance — all modules use this # Single shared instance — all modules use this
_logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log")) _logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log"))
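Call sites are unaffected by the rename; for example (a usage sketch matching the signature `API/zmq_api.py` already uses):

```python
from Core.logger import log

log("ZMQ PUB bound on tcp://127.0.0.1:5556", "info", "zmq")  # message, level, module tag
```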

View File

@ -1,8 +1,37 @@
# Marcus — Full API & Developer Reference # Marcus — Full API & Developer Reference
**Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU **Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU
**Scripts:** `~/Models_marcus/marcus_llava.py` + `~/Models_marcus/marcus_yolo.py` **Robot persona:** Sanad (wake word + self-intro; project code stays under `Marcus/`)
**Updated:** April 4, 2026 **Entry points:** `run_marcus.py` (terminal) / `Server/marcus_server.py` (WebSocket)
**Updated:** 2026-04-21
> **What changed since the early draft (April 4):** The project was restructured
> from two monolithic scripts (`marcus_llava.py` + `marcus_yolo.py`) into a
> layered architecture. See `Doc/architecture.md` for the current file tree and
> `Doc/environment.md` for the verified Jetson software stack, exact library
> versions, and GPU bring-up recipe. This reference still describes the
> function-level semantics (inputs/outputs/examples) — treat any file path in
> this document as illustrative and cross-check the actual module. Recent
> deltas are called out inline below.
### Recent API deltas (2026-04-21)
| Change | Location | Note |
|---|---|---|
| GPU is mandatory for YOLO | `Config/config_Vision.json`, `Vision/marcus_yolo.py` | `yolo_device` defaults to `"cuda"` and is enforced; `_resolve_device()` raises `RuntimeError` on missing CUDA. `yolo_half=true` runs FP16 on Orin (capability 8.7). |
| Ollama model | `Config/config_Brain.json` | Default `ollama_model` is `qwen2.5vl:3b` (not `llava:7b`). |
| Ollama compute-graph caps | `Config/config_Brain.json` | `num_batch=128`, `num_ctx=2048` — required on 16 GB Orin NX to prevent the llama runner OOM. Propagated by `API/llava_api.py` and `Vision/marcus_imgsearch.py` to every `ollama.chat` call. |
| `num_predict_main` lowered | `Config/config_Brain.json` | 200 → 120 (shaves ~400-600 ms per open-ended command; JSON still parses). |
| ZMQ bind moved out of import | `API/zmq_api.py` | `init_zmq()` must be called from the main process before any `send_vel/send_cmd`. `init_brain()` does this. Children spawned via `multiprocessing` no longer collide on port 5556. |
| Camera-retry poll | `Brain/marcus_brain.py::_handle_llava` | Replaced `time.sleep(1.0)` with 10×50 ms polls. |
| Conditional scan sleeps | `Navigation/goal_nav.py`, `Autonomous/marcus_autonomous.py` | Removed unconditional per-step naps when real work (YOLO hit, LLaVA call, forward move) already consumed wall time. |
| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
--- ---
@ -22,38 +51,54 @@
12. [JSON Schema Reference](#12-json-schema-reference) 12. [JSON Schema Reference](#12-json-schema-reference)
13. [Environment & Paths](#13-environment--paths) 13. [Environment & Paths](#13-environment--paths)
14. [Quick Reference Card](#14-quick-reference-card) 14. [Quick Reference Card](#14-quick-reference-card)
15. [Voice API (mic + TTS + STT)](#15-voice-api-mic--tts--stt)
--- ---
## 1. Configuration Variables ## 1. Configuration Variables
Defined at the top of `marcus_llava.py`. Edit here to change global behavior. All configuration is now **JSON-driven** and lives under `Config/`. Each module
loads its config at startup via `Core.config_loader.load_config(name)`.
| Variable | Default | Description | **`Config/config_ZMQ.json`** (Holosoma bridge)
|----------|---------|-------------|
| `ZMQ_HOST` | `"127.0.0.1"` | Holosoma ZMQ host |
| `ZMQ_PORT` | `5556` | Holosoma ZMQ port |
| `ZMQ_YOLO_PORT` | `5557` | YOLO ZMQ port (standalone mode) |
| `OLLAMA_MODEL` | `"llava:7b"` | LLaVA model via Ollama |
| `CAM_WIDTH` | `424` | Camera capture width (px) |
| `CAM_HEIGHT` | `240` | Camera capture height (px) |
| `CAM_FPS` | `15` | Camera frame rate |
| `CAM_QUALITY` | `70` | JPEG quality sent to LLaVA |
| `STOP_ITERATIONS` | `20` | gradual_stop message count |
| `STOP_DELAY` | `0.05` | seconds between stop messages |
| `STEP_PAUSE` | `0.3` | pause between consecutive action steps |
| `ARM_SDK_PATH` | `/home/unitree/unitree_sdk2_python` | Arm SDK path |
| `ARM_INTERFACE` | `"eth0"` | Network interface for arm SDK |
Defined at top of `marcus_yolo.py`: | Key | Default | Description |
|---|---|---|
| `zmq_host` | `"127.0.0.1"` | Holosoma ZMQ host |
| `zmq_port` | `5556` | Holosoma ZMQ port |
| `stop_iterations` | `20` | `gradual_stop()` message count |
| `stop_delay` | `0.05` | seconds between stop messages |
| `step_pause` | `0.3` | pause between consecutive action steps |
| Variable | Default | Description | **`Config/config_Brain.json`** (Ollama VL model)
|----------|---------|-------------|
| `YOLO_MODEL_PATH` | `.../Model/yolov8m.pt` | YOLO model path | | Key | Default | Description |
| `YOLO_CONFIDENCE` | `0.45` | Minimum detection confidence | |---|---|---|
| `YOLO_IOU` | `0.45` | NMS IOU threshold | | `ollama_model` | `"qwen2.5vl:3b"` | Ollama model tag |
| `YOLO_DEVICE` | `"cpu"` | Inference device ("cpu" or "cuda") | | `max_history` | `6` | conversation turns retained |
| `YOLO_IMG_SIZE` | `320` | Inference image size (smaller = faster) | | `num_batch` | `128` | llama.cpp batch — **cap, required for Jetson** |
| `num_ctx` | `2048` | llama.cpp KV context length — **cap, required for Jetson** |
| `num_predict_main` | `120` | max tokens for the main command path |
| `num_predict_goal` | `80` | goal-navigation call |
| `num_predict_patrol` | `100` | autonomous patrol call |
| `num_predict_talk` | `80` | talk-only path |
| `num_predict_verify` | `10` | YOLO condition verifier (`yes`/`no`) |
**`Config/config_Vision.json`** (YOLO)
| Key | Default | Description |
|---|---|---|
| `yolo_model_path` | `"Models/yolov8m.pt"` | weights file (auto-fetched if missing) |
| `yolo_confidence` | `0.45` | detection confidence threshold |
| `yolo_iou` | `0.45` | NMS IOU threshold |
| `yolo_device` | `"cuda"` | **GPU required**; `"cpu"` raises `RuntimeError` |
| `yolo_half` | `true` | FP16 inference (Ampere tensor cores) |
| `yolo_img_size` | `320` | inference image size |
| `tracked_classes` | 19 COCO classes | filter for relevant detections |
**`Config/config_Camera.json`**: `424x240 @ 15 fps`, `JPEG quality 70`.
**`Config/config_Voice.json`**: see section 6 below.
**`Config/config_Network.json`**: Jetson eth0/wlan0 IPs, WebSocket port.
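Every module reads its file through the same helper; a usage sketch of the `load_config(name)` contract described above:

```python
from Core.config_loader import load_config

cfg = load_config("Brain")                # reads Config/config_Brain.json
model = cfg["ollama_model"]               # "qwen2.5vl:3b"
flags = cfg.get("subsystems", {})         # {"lidar": true, "voice": true, ...}
```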
--- ---
@ -61,20 +106,28 @@ Defined at top of `marcus_yolo.py`:
### Setup ### Setup
The bind is no longer an import-time side effect. It runs inside `init_zmq()`, called once by `init_brain()` from the main process. Children (e.g. the LiDAR SLAM worker spawned via `multiprocessing.spawn`) can re-import `API.zmq_api` without rebinding.
```python ```python
ctx = zmq.Context() # API/zmq_api.py — bind happens here, not at module import
sock = ctx.socket(zmq.PUB) def init_zmq() -> zmq.Socket:
sock.bind("tcp://127.0.0.1:5556") global ctx, sock
time.sleep(0.5) if sock is not None:
return sock # idempotent
ctx = zmq.Context()
sock = ctx.socket(zmq.PUB)
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
time.sleep(0.5) # let SUBs attach
return sock
``` ```
### `send_vel(vx, vy, vyaw)` ### `send_vel(vx, vy, vyaw)`
Send velocity command to Holosoma. Send velocity command to Holosoma. Raises `RuntimeError` if `init_zmq()` wasn't called.
```python ```python
def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0): def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
``` ```
| Parameter | Unit | Safe range | Effect | | Parameter | Unit | Safe range | Effect |
@ -661,14 +714,17 @@ from unitree_sdk2py.g1.arm.g1_arm_action_client import G1ArmActionClient # Arm
``` ```
STARTUP: STARTUP:
Tab 1: source ~/.holosoma_deps/miniconda3/bin/activate hsinference
       python3 run_policy.py inference:g1-29dof-loco \
         --task.velocity-input zmq --task.state-input zmq --task.interface eth0
Tab 2: ollama serve &
       /home/unitree/miniconda3/envs/marcus/bin/python3 ~/Models_marcus/marcus_llava.py
       (YOLO starts automatically — no Tab 3 needed)
Tab 1 (hsinference env): Holosoma locomotion policy
       cd ~/holosoma && sudo jetson_clocks
       python3 run_policy.py inference:g1-29dof-loco \
         --task.velocity-input zmq --task.state-input zmq --task.interface eth0
Tab 2: ollama serve > /tmp/ollama.log 2>&1 &
       sleep 3
Tab 3 (marcus env): conda activate marcus && cd ~/Marcus && python3 run_marcus.py
       (YOLO + voice + LiDAR all start automatically per subsystems flags)
WAKE WORD: "Sanad"
COMMANDS: COMMANDS:
walk forward · turn right · turn left · move back walk forward · turn right · turn left · move back
@ -704,4 +760,74 @@ SAFETY:
--- ---
## 15. Voice API (mic + TTS + STT)
New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
### Mic — `Voice.builtin_mic.BuiltinMic`
Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed.
```python
from Voice.builtin_mic import BuiltinMic
mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000)
mic.start()
try:
pcm = mic.read_chunk(1024) # 512 samples, ~32 ms, int16 mono
# or
pcm = mic.read_seconds(3.0)
finally:
mic.stop()
```
Config under `config_Voice.json::mic_udp`.
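Underneath, joining the multicast group is standard socket code; a minimal sketch (an assumption about the internals: the real `BuiltinMic` wraps this in a receiver thread feeding a bounded buffer):

```python
import socket
import struct

def open_g1_mic(group: str = "239.168.123.161", port: int = 5555) -> socket.socket:
    """Join the G1 mic's UDP multicast group and return the bound socket."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(("", port))
    # Standard IP_ADD_MEMBERSHIP recipe: group address + any local interface.
    mreq = struct.pack("4sl", socket.inet_aton(group), socket.INADDR_ANY)
    sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
    sock.settimeout(0.04)  # config_Voice.json::mic_udp.read_timeout_sec
    return sock
```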
### TTS — `Voice.builtin_tts.BuiltinTTS`
Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. English only — refuses non-ASCII input.
```python
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client, default_speaker_id=0)
tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker
```
Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
### Wake + command loop — `Voice.marcus_voice.VoiceModule`
Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
```python
from API.audio_api import AudioAPI
from Voice.marcus_voice import VoiceModule
def on_command(text, lang):
print(f"heard: {text}")
audio = AudioAPI()
voice = VoiceModule(audio, on_command=on_command)
voice.start() # background thread
# ... later ...
voice.stop()
```
Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
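One pass through the loop might look like this (a condensed, hypothetical sketch: the real module adds silence detection, the command timeout, and a background thread; `transcribe()` in openai-whisper accepts a 16 kHz float32 waveform):

```python
import whisper  # openai-whisper: "tiny" for wake, "small" for commands

wake_model = whisper.load_model("tiny")
cmd_model = whisper.load_model("small")
WAKE_WORDS = {"sanad", "sannad", "sanat", "sunnat"}

def listen_once(audio_api, on_command):
    if audio_api.is_speaking:                     # never transcribe our own TTS
        return
    pcm = audio_api.record(seconds=2.0)           # int16 mono 16 kHz
    heard = wake_model.transcribe(pcm.astype("float32") / 32768.0,
                                  language="en")["text"].lower()
    if not any(w in heard for w in WAKE_WORDS):   # IDLE: no wake word heard
        return
    pcm = audio_api.record(seconds=5.0)           # WAKE_HEARD: capture command
    text = cmd_model.transcribe(pcm.astype("float32") / 32768.0,
                                language="en")["text"].strip()
    if text:
        on_command(text, "en")                    # PROCESSING: hand to the brain
```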
### AudioAPI — `API.audio_api.AudioAPI`
Orchestration layer. Owns the `AudioClient`, manages mute/unmute, exposes a clean `speak` + `record` API.
```python
from API.audio_api import AudioAPI
audio = AudioAPI()
audio.speak("Hello") # English only; non-ASCII returns early
pcm = audio.record(seconds=5) # int16 mono 16 kHz — uses BuiltinMic
audio.play_pcm(pcm) # raw PCM playback via Unitree RPC
```
Config: `config_Voice.json::tts.backend = "builtin_ttsmaker"`, `mic.backend = "builtin_udp"` (or `"pactl_parec"` to fall back to Hollyland).
---
*Marcus — YS Lootah Technology | Kassam | April 2026* *Marcus — YS Lootah Technology | Kassam | April 2026*

Binary file not shown.

Binary file not shown.

View File

@ -1,20 +1,39 @@
# Marcus — System Architecture # Marcus — System Architecture
**Project**: Marcus | YS Lootah Technology **Project**: Marcus | YS Lootah Technology
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX (16GB) **Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Updated**: 2026-04-06 **Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`)
**Updated**: 2026-04-21
---
## Recent deltas (since 2026-04-06)
- **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default.
- **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
- **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command.
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
- **G1 built-in microphone**: captured over UDP multicast `239.168.123.161:5555`; `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
- **G1 built-in TTS**: `client.TtsMaker()` wrapped by `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
- **Subsystem flags**: `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
- **Robot persona = "Sanad"** — wake words, prompts, banner, and self-intro all use "Sanad". Project identity ("Marcus") remains in file names, class names, directory, logs.
See `Doc/environment.md` for the verified Jetson software stack and `Doc/pipeline.md` for the end-to-end data flow.
---
## Overview
Marcus is a mostly-offline humanoid robot AI system. The brain runs on Jetson Orin NX using a local vision-language model (Qwen2.5-VL via Ollama) for open-ended commands, YOLOv8m for real-time object detection (CUDA + FP16), dead reckoning + optional ROS2 odometry for pose, Livox Mid-360 LiDAR + a custom SLAM worker for mapping, and persistent memory across sessions.
Two operating modes:
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson. Voice subsystem runs alongside by default.
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients.
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. Voice, LiDAR, image-search and autonomous-patrol are gated behind `config_Brain.json::subsystems` flags.
---
@ -28,14 +47,14 @@ Marcus/
├── Core/                       # Foundation layer — no external deps
│   ├── env_loader.py           # Reads .env, resolves PROJECT_ROOT
│   ├── config_loader.py        # load_config(name) → reads Config/config_{name}.json
│   ├── log_backend.py          # Logging engine (file-based, no console output) — was Logger.py
│   └── logger.py               # Project wrapper: log(), log_and_print(), get_logger()
├── Config/                     # ALL configuration — one JSON per module
│   ├── config_ZMQ.json         # ZMQ host, port, stop params
│   ├── config_Camera.json      # RealSense resolution, fps, quality
│   ├── config_Brain.json       # Ollama model, prompts, num_predict, num_batch/ctx, subsystems
│   ├── config_Vision.json      # YOLO model path, device=cuda, half=true, confidence, tracked classes
│   ├── config_Navigation.json  # move_map, goal aliases, YOLO goal classes
│   ├── config_Patrol.json      # patrol duration, proximity threshold
│   ├── config_Arm.json         # arm actions, aliases, availability flag
@ -43,17 +62,26 @@ Marcus/
│   ├── config_Memory.json      # session/places paths
│   ├── config_Network.json     # Jetson IPs (eth0/wlan0), ports
│   ├── config_ImageSearch.json # search defaults
│   ├── config_Voice.json       # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
│   ├── config_LiDAR.json       # Livox Mid-360 connection + SLAM engine params
│   └── marcus_prompts.yaml     # All Qwen-VL prompts (main, goal, patrol, talk, verify)
├── API/                        # Interface layer — one file per subsystem
│   ├── zmq_api.py              # ZMQ PUB socket: init_zmq(), send_vel(), gradual_stop(), send_cmd()
│   ├── camera_api.py           # RealSense thread: start/stop_camera(), get_frame()
│   ├── llava_api.py            # Qwen2.5-VL queries via Ollama: call_llava(), ask(), ask_goal()…
│   ├── yolo_api.py             # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()
│   ├── odometry_api.py         # Odometry wrapper: init_odometry(), get_position()
│   ├── memory_api.py           # Memory wrapper: init_memory(), log_cmd(), place_save/goto()
│   ├── arm_api.py              # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES (stub)
│   ├── imgsearch_api.py        # Image search wrapper: init_imgsearch(), get_searcher()
│   ├── audio_api.py            # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
│   └── lidar_api.py            # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
├── Voice/                      # Mic + TTS + wake-word STT
│   ├── builtin_mic.py          # G1 array mic via UDP multicast 239.168.123.161:5555
│   ├── builtin_tts.py          # BuiltinTTS — client.TtsMaker(text, speaker_id)
│   └── marcus_voice.py         # VoiceModule — Whisper tiny (wake) + small (command) state machine
├── Brain/                      # Decision logic — imports ONLY from API/
│   ├── marcus_brain.py         # Orchestrator: init_brain(), process_command(), run_terminal()
@ -127,39 +155,40 @@ Marcus/
│  Server/marcus_server.py (WebSocket)                │
└──────────────────┬───────────────────────────────────┘
┌──────────────────▼───────────────────────────────────┐
│                    Brain Layer                        │
│  marcus_brain.py   — init_brain() / process_command   │
│  command_parser.py — regex-table local commands       │
│  executor.py       — execute Qwen-VL decisions        │
│  marcus_memory.py  — session + place memory           │
└──────────────────┬───────────────────────────────────┘
                   │ imports only from API/
┌──────────────────▼───────────────────────────────────┐
│                     API Layer                         │
│  zmq_api    camera_api    llava_api    audio_api      │
│  yolo_api   odometry_api  memory_api   imgsearch_api  │
│  arm_api    lidar_api                                 │
└──────────────┬───────────────────────┬────────────────┘
               │ wraps                 │ wraps
┌──────────────▼───────────┐ ┌─────────▼───────────────┐
│   Navigation / Vision    │ │         Voice           │
│  goal_nav.py             │ │  builtin_mic.py         │
│  patrol.py               │ │  builtin_tts.py         │
│  marcus_odometry.py      │ │  marcus_voice.py        │
│  marcus_yolo.py          │ │  (Whisper + TtsMaker)   │
│  marcus_imgsearch.py     │ └─────────┬───────────────┘
└──────────────┬───────────┘           │
               │                       │
┌──────────────▼───────────────────────▼────────────────┐
│                     Core Layer                        │
│  env_loader.py    config_loader.py                    │
│  log_backend.py   logger.py                           │
└──────────────────┬────────────────────────────────────┘
                   │ reads
┌──────────────────▼────────────────────────────────────┐
│                  Config / .env                        │
│  13 JSON files + marcus_prompts.yaml                  │
└───────────────────────────────────────────────────────┘
```
**Rule**: Brain never imports from Vision/ or Navigation/ directly. It goes through the API layer.
@ -176,11 +205,11 @@ Reads `.env` from the project root to resolve `PROJECT_ROOT`. Uses a minimal bui
#### `config_loader.py` (30 lines)
`load_config(name)` reads `Config/config_{name}.json` and caches the result. All modules call this instead of hardcoding constants. Also provides `config_path(relative)` to resolve relative paths (e.g., `"Models/yolov8m.pt"`) to absolute paths from PROJECT_ROOT.
#### `log_backend.py` (186 lines, was `Logger.py`)
Full logging engine ported from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery. Renamed from `Logger.py` on 2026-04-21 to eliminate a case-only collision with `logger.py` that prevented the repo from cloning on case-insensitive filesystems (macOS/Windows).
#### `logger.py` (51 lines)
Project wrapper around `log_backend.Logs`. Provides:
- `log(message, level, module)` — write to `logs/{module}.log`
- `log_and_print(message, level, module)` — write + print
- `get_logger(module)` — get configured Logs instance
@ -191,12 +220,15 @@ Project wrapper around `Logger.py`. Provides:
Each API file wraps one subsystem. They read their own config via `load_config()`, handle import errors gracefully with fallback stubs, and export clean public functions.
#### `zmq_api.py` (~75 lines)
Holds the ZMQ PUB socket used to drive Holosoma at 50 Hz. **The bind is not a module import side effect any more** — it runs only when `init_zmq()` is called from the main (parent) process. This lets the LiDAR SLAM worker (spawned via `multiprocessing.spawn`) re-import the module without rebinding port 5556 and crashing.
**Exports:**
- `init_zmq()` — idempotent bind, called once by `init_brain()`
- `send_vel(vx, vy, vyaw)` — send velocity to Holosoma
- `gradual_stop()` — 20 zero-velocity messages over 1 second
- `send_cmd(cmd)` — Holosoma state machine (`start` / `walk` / `stand` / `stop`)
- `get_socket()` — access the bound socket (for odometry to reuse)
- `MOVE_MAP` — direction-to-velocity lookup: `{"forward": (0.3, 0, 0), "left": (0, 0, 0.3), ...}`
@ -440,6 +472,37 @@ Supports text-only search (no reference image) using hint description.
---
### Voice/
Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
#### `builtin_mic.py` (~180 lines, new 2026-04-21)
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer.
**Exports:**
- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
- `start()` / `stop()` — socket lifecycle
- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
- `read_seconds(s)` — convenience for "record `s` seconds"
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
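The `flush()` bullet is the echo-guard pattern; a rough sketch of the intended usage (assumed; the production loop lives in `Voice/marcus_voice.py`):
```python
from Voice.builtin_mic import BuiltinMic
from Voice.builtin_tts import BuiltinTTS

def ask_and_listen(mic: BuiltinMic, tts: BuiltinTTS, prompt: str) -> bytes:
    mic.flush()                    # drop anything captured before the prompt
    tts.speak(prompt, block=True)  # blocks for the estimated playback duration
    mic.flush()                    # drop the robot's own speech the mic picked up
    return mic.read_seconds(4.0)   # now capture the human's reply
```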
#### `builtin_tts.py` (~70 lines, new 2026-04-21)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).
**Exports:**
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()``BuiltinTTS`.
**Exports:**
- `VoiceModule(audio_api, on_command=cb)` — init
- `start()` — spawn background thread
- `stop()` — graceful teardown
---
### Server/
#### `marcus_server.py` (224 lines)
@ -1,15 +1,16 @@
# Marcus — Control & Startup Guide
**Robot persona:** Sanad (wake word + self-intro; project code lives under `Marcus/`)
**Updated**: 2026-04-21
---
## Quick Start
### Prerequisites (Jetson Orin NX, JetPack 5.1.1)
```bash
# Terminal 1 — Start Holosoma (locomotion policy, in hsinference env)
source ~/.holosoma_deps/miniconda3/bin/activate hsinference
cd ~/holosoma
~/.holosoma_deps/miniconda3/envs/hsinference/bin/python3 \
@ -19,28 +20,46 @@ cd ~/holosoma
  --task.velocity-input zmq \
  --task.state-input zmq \
  --task.interface eth0
# Terminal 2 — Ollama server (leave running)
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
ollama list # confirm qwen2.5vl:3b present
```
### Option A — Terminal Mode (on Jetson)
```bash
# Terminal 3 — Start Marcus Brain
conda activate marcus
cd ~/Marcus
python3 run_marcus.py
```
Direct keyboard control + voice input (say **"Sanad"** to wake). Expected banner on boot:
```
================================================
SANAD AI BRAIN — READY
================================================
model : qwen2.5vl:3b
yolo : True
odometry : True
memory : True
lidar : True
voice : True
camera : 424x240@15
```
### Option B — Server + Client (remote)
```bash
# Terminal 3 (Jetson) — Start Server
conda activate marcus
cd ~/Marcus
python3 -m Server.marcus_server
# Terminal 4 (Workstation) — Connect Client
cd ~/Robotics_workspace/yslootahtech/Project/Marcus
python3 -m Client.marcus_cli
```
@ -58,6 +77,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
---
## Voice
- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.
Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker.
To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
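For reference, a plausible shape for the keys mentioned above (values are the ones documented in this guide; the real `Config/config_Voice.json` has more fields):
```jsonc
{
  "mic":     { "backend": "builtin_udp" },                        // or "pactl_parec" (Hollyland fallback)
  "mic_udp": { "group": "239.168.123.161", "port": 5555 },
  "tts":     { "backend": "builtin_ttsmaker" },
  "stt":     { "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"] }
}
```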
---
## Command Reference
### Movement
@ -75,17 +108,17 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
### Vision
| Command | Action |
|---------|--------|
| `what do you see` | Qwen2.5-VL describes camera view |
| `describe the room` | Qwen2.5-VL scene description |
| `is anyone here` | Qwen2.5-VL person check |
| `yolo` | Show YOLO detection status |
### Goal Navigation
| Command | Action |
|---------|--------|
| `goal/ stop when you see a person` | YOLO fast search + stop |
| `goal/ find a laptop` | YOLO + Qwen-VL search |
| `goal/ stop when you see a guy holding a phone` | YOLO + Qwen-VL compound verification |
| `find a person` | Auto-detected as goal (no prefix needed) |
| `look for a bottle` | Auto-detected as goal |
@ -106,7 +139,7 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `patrol` | Autonomous patrol (prompts for duration) |
| `patrol: door → desk → exit` | Named waypoint patrol |
### Image Search (requires `subsystems.imgsearch: true`)
| Command | Action |
|---------|--------|
| `search/ /path/to/photo.jpg` | Find target from reference image |
@ -122,11 +155,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `last session` | Previous session summary |
| `session summary` | Current session stats |
### Autonomous Mode
| Command | Action |
|---------|--------|
| `auto on` | Start autonomous exploration |
| `auto off` | Stop |
| `auto status` | Current step / observations |
| `auto save` | Snapshot observations to disk |
### System
| Command | Action |
|---------|--------|
| `help` | Command reference |
| `example` | Usage examples |
| `lidar` / `lidar status` | SLAM engine pose + health |
| `q` / `quit` | Shutdown |
### Client-Only Commands (CLI)
@ -139,35 +181,43 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
---
## Subsystem flags (`Config/config_Brain.json`)
Control what initializes at boot. Defaults:
```jsonc
"subsystems": {
"lidar": true,
"voice": true,
"imgsearch": false,
"autonomous": true
}
```
Set any to `false` to skip that subsystem's init. Boot time drops roughly:
- `voice: false` → ~2 s faster (no Whisper model load)
- `lidar: false` → ~1 s faster (no SLAM subprocess spawn)
- `imgsearch: false` → already the default; re-enable only when you need `search/ …`
- `autonomous: false` → minor, but removes the AutonomousMode init
---
## Network Configuration
| Interface | IP | Use |
|-----------|-----|------|
| `eth0` | 192.168.123.164 | Robot internal network (Jetson ↔ G1 ↔ LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson ↔ Workstation) |
| Service | Port | Protocol |
|---------|------|----------|
| Marcus WebSocket | 8765 | ws:// |
| ZMQ velocity (→ Holosoma) | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP (localhost only) |
| G1 audio multicast (mic) | 5555 | UDP multicast 239.168.123.161 |
| Livox Mid-360 (LiDAR) | 192.168.123.120 | UDP (Livox SDK) |
Most values configurable in `Config/config_Network.json` and `config_Voice.json::mic_udp`.
---
## Subsystem Status
On startup, the server/brain shows:
```
YOLO : active (19 tracked classes, CPU, yolov8m.pt)
Odometry : active (dead reckoning, +/-10cm)
Memory : active (session_016_2026-04-06)
Camera : 424x240@15 (RealSense D435I)
LiDAR : ALIVE (Livox Mid360 at 192.168.123.120)
Arms : pending (GR00T N1.5 not yet integrated)
```
--- ---
@ -175,13 +225,15 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Issue | Cause | Fix |
|-------|-------|-----|
| Banner shows `SANAD AI BRAIN — READY` but nothing moves | Holosoma not running | Start Holosoma (Terminal 1) first |
| `RuntimeError: CUDA not available` on boot | Wrong torch build on Jetson | See `Doc/environment.md` section 9.2 — reinstall the NVIDIA Jetson torch wheel |
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10-15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
---
@ -191,6 +243,7 @@ Arms : pending (GR00T N1.5 not yet integrated)
|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
@ -199,3 +252,5 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Logs | `~/Marcus/logs/` |
See `Doc/architecture.md` for full project structure and file-by-file documentation.
See `Doc/environment.md` for the verified Jetson software stack.
See `Doc/pipeline.md` for the end-to-end data flow.
@ -1,10 +1,11 @@
# Marcus — Environment & Version Reference
**Project**: Marcus | YS Lootah Technology
**Robot persona**: Sanad (wake word + self-intro; codebase stays under `Marcus/`)
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Deployment host**: `unitree@192.168.123.164` (hostname `ubuntu`)
**Conda env**: `marcus`
**Captured**: 2026-04-12 (updated 2026-04-21)
This document is the canonical record of the verified GPU-accelerated software stack running on the Jetson Orin NX. It covers system software, Python environment, Marcus runtime dependencies, installation recipe, verification commands, and known quirks. Pair it with `architecture.md` (what the code does) and `controlling.md` (how to drive it).
@ -136,29 +137,23 @@ Captured from `importlib` on 2026-04-12, `marcus` env on the Jetson.
## 8. Marcus project modules — import status
All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:
```
OK  Core.config_loader    Core.env_loader
OK  Core.log_backend      Core.logger
OK  Voice.builtin_mic     Voice.builtin_tts     Voice.marcus_voice
OK  Vision.marcus_yolo    Vision.marcus_imgsearch
OK  API.llava_api         API.yolo_api          API.camera_api
OK  API.zmq_api           API.imgsearch_api     API.odometry_api
OK  API.memory_api        API.arm_api           API.audio_api
OK  Navigation.goal_nav   Navigation.patrol     Navigation.marcus_odometry
OK  Brain.marcus_brain    Brain.marcus_memory   Brain.command_parser
OK  Autonomous.marcus_autonomous
```
Notable removals: `Voice/marcus_gemini_voice.py` deleted on 2026-04-21. `Core/Logger.py` renamed to `Core/log_backend.py`.
---
## 9. Installation recipe (reproducing this environment)
@ -378,3 +373,7 @@ Config file (`Config/config_Vision.json`):
| 2026-04-12 | Initial environment.md — full stack captured, GPU bring-up verified end to end. Steady-state YOLOv8m FPS on Orin NX measured at 21.9. Ollama Qwen2.5-VL verified at 100% GPU. |
| 2026-04-12 | `Vision/marcus_yolo.py` rewired to load `config_Vision.json`, added `_resolve_device()` with hard-fail on missing CUDA (GPU-only policy). `API/yolo_api.py` updated to propagate `RuntimeError`. `Config/config_Vision.json` set `yolo_device=cuda`, `yolo_half=true`. |
| 2026-04-12 | Installed NVIDIA Jetson torch `2.1.0a0+41361538.nv23.06` (replacing CPU-only PyPI `2.4.1`) + built torchvision `0.16.1` from source against it. Verified `nms device = cuda:0`. |
| 2026-04-12 | Fixed llama.cpp compute-graph OOM on Jetson: added `num_batch=128` + `num_ctx=2048` caps in `Config/config_Brain.json`, propagated through `API/llava_api.py` and `Vision/marcus_imgsearch.py`. Qwen2.5-VL compute graph drops from ~7.5 GiB to ~1.8 GiB. |
| 2026-04-21 | **Restructure**: moved ZMQ bind out of `API/zmq_api.py` import time into `init_zmq()`; fixes LiDAR SLAM worker spawn crash. Added loud GPU-requirement banner in `API/yolo_api.py`. Dropped `num_predict_main` 200→120. Made inner-loop sleeps in goal_nav/autonomous/imgsearch conditional. Renamed `Core/Logger.py` → `Core/log_backend.py` (case-collision fix). Updated `Doc/MARCUS_API.md` to current state. |
| 2026-04-21 | **Voice restructure**: added `Voice/builtin_mic.py` (G1 array mic via UDP multicast `239.168.123.161:5555`) and `Voice/builtin_tts.py` (thin `AudioClient.TtsMaker` wrapper). Rewired `Voice/marcus_voice.py` to use BuiltinMic. Refactored `API/audio_api.py::speak()` to use BuiltinTTS — removed ~110 lines of edge-tts + pydub + Piper plumbing. Deleted `Voice/marcus_gemini_voice.py`. Added `subsystems.{lidar,voice,imgsearch,autonomous}` gates in `config_Brain.json`, checked in `init_brain()`. |
| 2026-04-21 | **Persona swap**: robot identifies as Sanad. Wake words `["sanad","sannad","sanat","sunnat"]`, `speaker.app_name="sanad"`, all Qwen prompts say "You are Sanad", banner reads `SANAD AI BRAIN — READY`, hardcoded self-intro says "I am Sanad". Project directory, class names, filenames, and `PROJECT_NAME=Marcus` env var unchanged. |
@ -38,15 +38,12 @@ rm ~/Robotics_workspace/yslootahtech/Project/Marcus_fine_tune/marcus-gguf/marcus
https://github.com/AnjieCheng/NaVILA
https://rchalyang.github.io/EgoVLA/
https://github.com/RchalYang/EgoVLA_Release
https://github.com/openvla/openvla
https://github.com/unitreerobotics/unifolm-vla
https://github.com/OpenDriveLab/WholebodyVLA
Doc/pipeline.md (new file, 187 lines)
@ -0,0 +1,187 @@
# Marcus — End-to-End Pipeline
**Robot persona:** Sanad (wake word + self-intro)
**Updated:** 2026-04-21
One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is) and `MARCUS_API.md` (function signatures).
---
## Boot sequence
`Brain/marcus_brain.py::init_brain()` — called once from `run_marcus.py` or `marcus_server.py`.
```
run_marcus.py
init_brain()
├─ init_zmq() PUB bind tcp://127.0.0.1:5556 → Holosoma
├─ start_camera() RealSense 424×240@15fps → shared _raw_frame
├─ init_yolo(raw_frame, raw_lock) YOLOv8m CUDA FP16, 19 classes — background thread
├─ init_odometry() ROS2 /dog_odom → dead reckoning fallback
├─ init_memory() loads Data/Brain/Sessions/session_NNN/
├─ if subsystems.lidar: init_lidar() multiprocessing spawn SLAM_worker
├─ if subsystems.imgsearch: init_imgsearch() (off by default)
├─ if subsystems.autonomous: AutonomousMode() patrol state machine
├─ send_cmd("start") + 0.5s + send_cmd("walk") + 0.5s Holosoma handshake
├─ if subsystems.voice: _init_voice() ▼ voice pipeline below
└─ _warmup_llava() first Qwen2.5-VL inference
"SANAD AI BRAIN — READY"
```
Subsystem flags live in `config_Brain.json::subsystems`. Current defaults:
```json
"subsystems": { "lidar": true, "voice": true, "imgsearch": false, "autonomous": true }
```
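A sketch of how these flags gate `init_brain()` stages (assumed shape, using names from the boot sequence above; `_init_voice` and `AutonomousMode` are internal to the brain):
```python
from Core.config_loader import load_config
from API.lidar_api import init_lidar
from API.imgsearch_api import init_imgsearch

subs = load_config("Brain").get("subsystems", {})

if subs.get("lidar", True):
    init_lidar()              # spawns the SLAM worker subprocess
if subs.get("imgsearch", False):
    init_imgsearch()          # off by default
if subs.get("autonomous", True):
    auto = AutonomousMode()   # patrol state machine
if subs.get("voice", True):
    _init_voice()             # BuiltinMic + Whisper + TtsMaker loop
```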
---
## Voice pipeline (when `subsystems.voice = true`)
```
G1 body mic (array)
└─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM
Voice/builtin_mic.py::BuiltinMic
ring buffer (64 KB) + read_chunk(n)
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
├─ PROCESSING : record-until-silence → Whisper small → transcribed text
└─ on_command(text, "en")
Brain/marcus_brain.py::process_command(text)
├─ regex fast-path → Brain/command_parser.py::try_local_command()
│ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
└─ else → _handle_llava(text)
├─ get_frame() (10×50 ms poll, no 1 s stall)
├─ API/llava_api.py::ask(text, img)
│ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
│ → parse_json() → {actions, arm, speak, abort}
└─ Brain/executor.py::execute(d)
├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
├─ arm → API/arm_api.py (stub for now)
└─ abort → gradual_stop()
result["speak"] → audio_api.speak(reply)
API/audio_api.py::speak(text, lang="en")
├─ mute mic (flush BuiltinMic buffer)
├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
│ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
│ time.sleep(len(text) * 0.08)
└─ unmute mic → back to IDLE
```
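The IDLE-state wake check reduces to a substring match over the Whisper `tiny` transcript; a sketch (assumed implementation; `Voice/marcus_voice.py` owns the real logic):
```python
import re

def heard_wake_word(transcript: str, wake_words) -> bool:
    """True if any configured wake variant appears in the Whisper transcript."""
    words = re.sub(r"[^a-z ]", " ", transcript.lower()).split()
    return any(w in words for w in wake_words)

# heard_wake_word("Hey Sanad, come here", ["sanad", "sannad", "sanat", "sunnat"]) -> True
```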
---
## Terminal / WebSocket command pipeline (same brain, skips voice)
```
run_marcus.py stdin OR Server/marcus_server.py WebSocket
Brain/marcus_brain.py::process_command(text)
▼ (same parser → LLaVA → executor → ZMQ as above)
result dict → stdout OR WebSocket reply frame
```
---
## Vision pipeline (continuous, consumed by brain on demand)
```
RealSense D435 (USB)
└─ 424×240 BGR 15 fps
→ API/camera_api.py — shared _raw_frame (thread-safe)
│ │
│ └─ get_frame() → JPEG base64 on demand
Vision/marcus_yolo.py (daemon thread)
YOLOv8m @ cuda:0 FP16 imgsz=320
→ _latest_detections (thread-safe list)
yolo_sees / yolo_closest / yolo_summary / yolo_fps
Navigation/goal_nav.py (fast YOLO check → Qwen-VL fallback)
Autonomous/marcus_autonomous.py (patrol scan every N steps)
Brain/marcus_brain.py (status / alerts)
```
---
## Movement pipeline
```
Brain/executor.py OR Brain/command_parser.py OR Navigation/*
│ uses MOVE_MAP from config_Navigation.json
API/zmq_api.py::send_vel(vx, vy, vyaw) JSON over ZMQ PUB (port 5556)
Holosoma RL policy (separate process, hsinference env)
G1 low-level joint commands over DDS/eth0
29-DOF body motion
```
---
## LiDAR pipeline (when `subsystems.lidar = true`)
```
Livox Mid-360 (192.168.123.120, UDP)
Lidar/SLAM_worker.py (multiprocessing.spawn subprocess — CUDA-safe spawn)
├─ SLAM_engine, SLAM_Filter, SLAM_LoopClosure, SLAM_Submap, SLAM_NavRuntime
├─ publishes pose + obstacle flags back to parent via Queue
└─ writes occupancy grids to Data/Navigation/Maps/
API/lidar_api.py (reads the queues, exposes:)
├─ obstacle_ahead() → bool
├─ get_lidar_status() → dict (pose, loc_state, frame age, FPS, ICP ms)
└─ LIDAR_AVAILABLE
Navigation/goal_nav.py rotation thread — pauses motion on obstacle_ahead()
Brain/command_parser.py — responds to "lidar status" queries
```
---
## Knobs that control each stage
| Knob | Location | Effect |
|---|---|---|
| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off |
| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off |
| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off |
| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off |
| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) |
| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply |
| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) |
| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) |
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |
---
## Per-command latency (estimated, post-fixes)
| Step | Typical | Notes |
|---|---|---|
| Wake-word detect | 200-500 ms | Whisper tiny on 2 s chunk |
| Record until silence | 1-8 s | depends on user speech |
| Whisper small STT | 500-1500 ms | once per command |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800-1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot |
**Total wake → answer-playback:** ~**2.5-4 s** for a short vision question like "what do you see" (vs. 5-8 s with the pre-restructure edge-tts/Gemini overhead).
@ -123,26 +123,36 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
    reached = False
    try:
        for step in range(1, max_steps + 1):
            # Track whether real work happened this iteration. If it did,
            # the work itself already ate wall time — don't pay an extra
            # SCAN_INTERVAL nap on top.
            did_work = False

            # --- YOLO fast check ---
            if yolo_target and yolo_sees(yolo_target):
                img_b64 = get_frame()
                did_work = True
                if condition:
                    if not _verify_condition(yolo_target, condition, img_b64):
                        print(f" [GoalNav] YOLO sees {yolo_target} but condition "
                              f"'{condition}' not met — continuing")
                        # fall through to the sleep-skip path
                    else:
                        print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                        log_detection(yolo_target, position="goal", distance="close")
                        reached = True
                        break
                else:
                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break
            # --- LLaVA fallback (less frequent — every few steps) ---
            if step >= MIN_STEPS and step % MIN_STEPS == 0:
                img_b64 = get_frame()
                if img_b64:
                    did_work = True
                    d = ask_goal(goal, img_b64)
                    if d.get("reached"):
                        print(f" [GoalNav] LLaVA says goal reached at step {step}")
@ -152,6 +162,11 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
                    if speak:
                        print(f" [GoalNav] LLaVA: {speak}")
            # Only pay the scan interval when nothing happened this step.
            # If YOLO hit or LLaVA fired, they already took 50-1000 ms.
            if not did_work:
                time.sleep(SCAN_INTERVAL)
    finally:
        rotating[0] = False
        rot_thread.join(timeout=1.0)
@ -59,7 +59,9 @@ except ImportError:
# ══════════════════════════════════════════════════════════════════════════════
DEFAULT_MAX_STEPS = 60       # max rotation steps before giving up
STEP_DELAY = 0.15            # min gap between YOLO checks (was 0.4 — reduced
                             #   because the rotation thread paces motion already
                             #   and each LLaVA call is 600-1500 ms of real work)
ROTATE_SPEED = 0.25          # rad/s rotation speed during search
MIN_STEPS_WARMUP = 3         # skip first N steps (stale frame)
MATCH_CONFIDENCE_THR = 0.6   # LLaVA confidence threshold (not used directly,
Voice/builtin_mic.py (new file, 202 lines)
@ -0,0 +1,202 @@
"""
builtin_mic.py - G1 built-in microphone (UDP multicast capture)
================================================================
The G1 humanoid's on-board microphone is published by the Unitree firmware
as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying
16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network
can join the group and read the audio; no extra SDK call required.
This module intentionally has no dependency on pyaudio, pulseaudio, or the
unitree_sdk2py package. Joining the multicast group is all that's needed.
Usage:
from Voice.builtin_mic import BuiltinMic
mic = BuiltinMic()
mic.start()
try:
chunk = mic.read_chunk(1024) # 512 samples, 32 ms at 16 kHz
...
finally:
mic.stop()
Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation).
"""
from __future__ import annotations
import socket
import struct
import subprocess
import threading
import time
from typing import Optional
DEFAULT_GROUP = "239.168.123.161"
DEFAULT_PORT = 5555
DEFAULT_BUF_MAX = 64_000 # ~2 s of 16 kHz mono int16
DEFAULT_READ_TIMEOUT = 0.04 # 40 ms budget per read_chunk call
SAMPLE_RATE = 16_000 # hardware rate — do not change
def _find_g1_local_ip() -> str:
"""
Return the host IPv4 on the G1's internal 192.168.123.0/24 network.
Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on.
"""
out = subprocess.run(
["ip", "-4", "-o", "addr"], capture_output=True, text=True,
).stdout
for line in out.splitlines():
for tok in line.split():
if tok.startswith("192.168.123."):
return tok.split("/")[0]
raise RuntimeError(
"BuiltinMic: no interface on 192.168.123.0/24 — "
"host is not on the G1's internal network"
)
class BuiltinMic:
"""
G1 on-board microphone over UDP multicast.
Thread-safe: a background daemon thread receives datagrams into an
internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or
blocks up to `read_timeout` before returning zeros.
"""
sample_rate = SAMPLE_RATE
def __init__(
self,
group: str = DEFAULT_GROUP,
port: int = DEFAULT_PORT,
buf_max: int = DEFAULT_BUF_MAX,
read_timeout: float = DEFAULT_READ_TIMEOUT,
):
self._group = group
self._port = port
self._buf_max = buf_max
self._read_timeout = read_timeout
self._sock: Optional[socket.socket] = None
self._buf = bytearray()
self._lock = threading.Lock()
self._running = False
self._thread: Optional[threading.Thread] = None
def start(self) -> None:
if self._running:
return
local_ip = _find_g1_local_ip()
self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self._sock.bind(("", self._port))
mreq = struct.pack(
"4s4s",
socket.inet_aton(self._group),
socket.inet_aton(local_ip),
)
self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
self._sock.settimeout(1.0)
self._running = True
self._thread = threading.Thread(
target=self._recv_loop, daemon=True, name="builtin_mic_rx",
)
self._thread.start()
print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}")
def _recv_loop(self) -> None:
while self._running:
try:
data, _ = self._sock.recvfrom(4096)
with self._lock:
self._buf.extend(data)
# ring-buffer: drop oldest when we'd exceed buf_max
if len(self._buf) > self._buf_max:
del self._buf[: len(self._buf) - self._buf_max]
except socket.timeout:
continue
except Exception:
if self._running:
time.sleep(0.01)
def read_chunk(self, num_bytes: int) -> bytes:
"""
Return exactly `num_bytes` of 16 kHz mono int16 PCM.
Waits up to `read_timeout` for that many bytes to be available.
If the buffer is still short after the timeout, returns whatever
is available padded with silence. Never blocks forever.
"""
deadline = time.time() + self._read_timeout
while time.time() < deadline:
with self._lock:
if len(self._buf) >= num_bytes:
chunk = bytes(self._buf[:num_bytes])
del self._buf[:num_bytes]
return chunk
time.sleep(0.003)
with self._lock:
avail = len(self._buf)
if avail > 0:
chunk = bytes(self._buf[:avail])
del self._buf[:avail]
return chunk + b"\x00" * (num_bytes - avail)
return b"\x00" * num_bytes
def read_seconds(self, seconds: float) -> bytes:
"""
Convenience: capture `seconds` of audio and return as bytes.
Blocks for the full duration (not a real-time producer).
"""
num_bytes = int(seconds * self.sample_rate * 2) # 2 bytes/sample (int16)
out = bytearray()
chunk_bytes = 1024
while len(out) < num_bytes:
out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out))))
return bytes(out)
def flush(self) -> None:
"""Drop all buffered audio (e.g. after the robot spoke)."""
with self._lock:
self._buf.clear()
def stop(self) -> None:
self._running = False
if self._sock is not None:
try:
self._sock.close()
except Exception:
pass
self._sock = None
if self._thread is not None:
self._thread.join(timeout=1.5)
self._thread = None
# ────────────────────────────────────────────────────────────────
# Standalone test — capture 3 s and print energy stats
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import array
print("BuiltinMic standalone test — capturing 3 s from G1...")
mic = BuiltinMic()
mic.start()
time.sleep(0.3) # let the receiver thread warm up
raw = mic.read_seconds(3.0)
mic.stop()
samples = array.array("h", raw)
if not samples:
print(" FAIL — got zero samples")
else:
mn = min(samples); mx = max(samples)
mean_abs = sum(abs(s) for s in samples) / len(samples)
print(f" samples={len(samples)} min={mn} max={mx} mean|s|={mean_abs:.0f}")
if mean_abs > 30:
print(" OK — mic is capturing audio")
else:
print(" WARN — signal very low, check G1 audio service is running")
Voice/builtin_tts.py (new file, 88 lines)
@ -0,0 +1,88 @@
"""
builtin_tts.py - Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker; no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
Supported languages (firmware-side):
    English - works (Marcus uses this)
    Chinese - works (unused)
    Arabic  - silently falls back to Chinese (unusable; we refuse these)

Signature:
    client.TtsMaker(text: str, speaker_id: int) -> int   # 0 = success
    speaker_id in {0, 1, 2}: different voice timbres
Usage:
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client)
tts.speak("Hello, I am Sanad", speaker_id=0)
"""
from __future__ import annotations
import logging
import time
from typing import Optional
log = logging.getLogger("builtin_tts")
class BuiltinTTS:
"""Synchronous English-only TTS via the G1's on-board engine."""
# Rough playback duration per character — enough margin that `speak()`
# returns after audio has actually finished on the robot.
SECONDS_PER_CHAR = 0.08
MIN_SECONDS = 1.5
def __init__(self, audio_client, default_speaker_id: int = 0):
"""
Args:
audio_client : initialized unitree_sdk2py AudioClient
default_speaker_id : 0, 1, or 2 (default voice timbre)
"""
self._client = audio_client
self._default_speaker = default_speaker_id
def speak(
self,
text: str,
speaker_id: Optional[int] = None,
block: bool = True,
) -> int:
"""
Play `text` on the G1 speaker via TtsMaker.
English-only by policy. Non-ASCII (Arabic) input is rejected rather
than silently played back as Chinese. Returns the TtsMaker status
code (0 = success) or -1 if input was rejected.
"""
if not text or not text.strip():
return -1
# Reject non-English. TtsMaker "falls back" by playing Arabic text
# as Chinese phonemes — intelligible to nobody — so we refuse it
# rather than surprise the operator.
if any(ord(c) > 127 for c in text):
log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
return -1
sid = self._default_speaker if speaker_id is None else speaker_id
log.info("[TtsMaker sid=%d] %s", sid, text[:80])
try:
code = self._client.TtsMaker(text, sid)
except Exception as e:
log.error("TtsMaker call failed: %s", e)
return -1
if block:
# Estimate how long the G1 is going to take to finish speaking.
# TtsMaker is fire-and-forget — we need to wait so the mic loop
# knows when to unmute.
duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
time.sleep(duration)
return code
Voice/marcus_gemini_voice.py (deleted, 608 lines)
@ -1,608 +0,0 @@
#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py - Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.
Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera
Usage:
from Voice.marcus_gemini_voice import GeminiVoiceModule
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
voice.start()
"""
import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time
import numpy as np
from dotenv import load_dotenv
load_dotenv()
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
handlers=[
logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
logging.StreamHandler(),
],
)
log = logging.getLogger("gemini_voice")
def load_config(name: str) -> dict:
path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
with open(path, "r") as f:
return json.load(f)
# ─── CONFIGURATION ────────────────────────────────────────
API_KEY = "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8"
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
URI = (
"wss://generativelanguage.googleapis.com/ws/"
"google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
f"?key={API_KEY}"
)
VOICE_NAME = "Charon"
SEND_RATE = 16000
RECEIVE_RATE = 24000
CHUNK_SIZE = 512
CHANNELS = 1
def load_system_prompt():
paths = [
os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
]
for p in paths:
if os.path.exists(p):
with open(p, "r", encoding="utf-8-sig") as f:
return f.read().strip()
return (
"You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
"Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
)
# ─── AUDIO HELPERS ────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
try:
samples = array.array("h", pcm)
if not samples:
return 0
return sum(abs(s) for s in samples) // len(samples)
except Exception:
return 0
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
# ─── GEMINI VOICE MODULE ─────────────────────────────────
class GeminiVoiceModule:
"""Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""
def __init__(self, audio_api, on_transcript=None):
self._audio = audio_api
self._on_transcript = on_transcript
self._config = load_config("Voice")
self._mic_source = getattr(audio_api, '_mic_source',
self._config["mic"].get("source_index", "0"))
# State
self.speaking = False
self.interrupted = False
self._running = False
self._thread = None
self._audio_queue = None # Created in async context
# Tuning
self.MIN_THRESHOLD = 3000
self.barge_in_threshold = self.MIN_THRESHOLD
self.REQUIRED_LOUD_CHUNKS = 10
self.PREBUFFER_CHUNKS = 2
self.PLAYBACK_TIMEOUT = 0.25
self.BARGE_IN_COOLDOWN = 0.7
self.AI_SPEAK_GRACE = 0.20
self.ECHO_GUARD_SEC = 0.8
self.SPEAKING_ENERGY_GATE = 0.85
self.SEND_SILENCE_WHEN_SPEAKING = True
# Timing
self._ai_speaking_since = 0.0
self._last_ai_audio_time = 0.0
self._barge_in_block_until = 0.0
self._ignore_input_until = 0.0
# ASR buffer
self._asr_buf = ""
self._asr_last_time = 0.0
self.ASR_WINDOW_SEC = 2.0
# Find Hollyland mic PyAudio device index
self._mic_device_idx = self._find_mic_device()
log.info("GeminiVoiceModule v2 initialized")
# ─── MIC DEVICE DETECTION ─────────────────────────────
def _find_mic_device(self) -> int:
"""Find Hollyland wireless mic in PyAudio devices. Returns device index."""
import pyaudio
import ctypes
ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
def _alsa_error_handler(filename, line, function, err, fmt):
pass # suppress
c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
try:
asound = ctypes.cdll.LoadLibrary("libasound.so.2")
asound.snd_lib_error_set_handler(c_error_handler)
except: pass # ALSA_suppress
pa = pyaudio.PyAudio()
try:
# First: set PulseAudio default source to Hollyland
subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
# Search for wireless mic by name
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
name = info.get("name", "").lower()
if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
return i
# Fallback to 'default' or 'pulse' device
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
log.info("Mic fallback: [%d] %s", i, info["name"])
return i
log.warning("No mic found, using device 0")
return 0
finally:
pa.terminate()
# ─── MIC CALIBRATION ──────────────────────────────────
def _calibrate_mic(self):
"""Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        _suppress_alsa_warnings()  # silence ALSA stderr chatter before PyAudio init
pa = pyaudio.PyAudio()
mic_info = pa.get_device_info_by_index(self._mic_device_idx)
mic_rate = int(mic_info["defaultSampleRate"])
mic_channels = 1
try:
stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
rate=mic_rate, input=True,
input_device_index=self._mic_device_idx,
frames_per_buffer=CHUNK_SIZE)
values = []
for _ in range(40):
data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
values.append(audio_energy(data))
stream.stop_stream()
stream.close()
avg_noise = sum(values) / len(values) if values else 0
self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
except Exception as e:
log.warning("Calibration failed: %s", e)
finally:
pa.terminate()
# ─── G1 SPEAKER PLAYBACK ─────────────────────────────
def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
"""Play 24kHz audio on G1 speaker (resample to 16kHz, single call)."""
if len(pcm_24k) < 100:
return
# Resample 24kHz → 16kHz
tl = int(len(pcm_24k) * 16000 / 24000)
audio_16k = np.interp(
np.linspace(0, len(pcm_24k), tl, endpoint=False),
np.arange(len(pcm_24k)),
pcm_24k.astype(np.float64),
).astype(np.int16)
from unitree_sdk2py.g1.audio.g1_audio_api import (
ROBOT_API_ID_AUDIO_START_PLAY,
ROBOT_API_ID_AUDIO_STOP_PLAY,
)
client = self._audio._client
if not client:
return
app_name = "gemini"
client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
time.sleep(0.1)
pcm = audio_16k.tobytes()
sid = f"s_{int(time.time() * 1000)}"
param = json.dumps({
"app_name": app_name,
"stream_id": sid,
"sample_rate": 16000,
"channels": 1,
"bits_per_sample": 16,
})
client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))
duration = len(audio_16k) / 16000
time.sleep(duration + 0.3)
client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
# ─── WEBSOCKET TASKS ─────────────────────────────────
async def _capture_mic(self, ws):
"""Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        _suppress_alsa_warnings()  # silence ALSA stderr chatter before PyAudio init
pa = pyaudio.PyAudio()
mic_info = pa.get_device_info_by_index(self._mic_device_idx)
mic_rate = int(mic_info["defaultSampleRate"])
mic_channels = 1
# Open mic at native rate/channels
stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
rate=mic_rate, input=True,
input_device_index=self._mic_device_idx,
frames_per_buffer=CHUNK_SIZE)
log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)
loud_chunks = 0
loop = asyncio.get_event_loop()
needs_resample = mic_rate != SEND_RATE or mic_channels != 1
try:
while self._running:
data = await loop.run_in_executor(
None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))
# Convert to mono 16kHz if needed
if needs_resample:
audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono (defensive: the stream above is opened mono)
if mic_channels == 2:
audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
# Resample to 16kHz
if mic_rate != SEND_RATE:
tl = int(len(audio) * SEND_RATE / mic_rate)
if tl > 0:
audio = np.interp(
np.linspace(0, len(audio), tl, endpoint=False),
np.arange(len(audio)),
audio.astype(np.float64),
).astype(np.int16)
data = audio.tobytes()
energy = audio_energy(data)
now = time.time()
# Barge-in detection
if self.speaking and now >= self._barge_in_block_until:
if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
if energy > self.barge_in_threshold:
loud_chunks += 1
else:
loud_chunks = 0
if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
log.info("Barge-in detected!")
self.interrupted = True
self.speaking = False
                            # Drain queued AI audio so playback stops immediately
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
loud_chunks = 0
self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
# Echo suppression: send silence while speaking
data_to_send = data
if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
if energy < gate:
data_to_send = SILENCE_PCM
# Send to Gemini
b64 = base64.b64encode(data_to_send).decode()
msg = {
"realtime_input": {
"media_chunks": [
{"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
]
}
}
await ws.send(json.dumps(msg))
except Exception as e:
if self._running:
log.error("Mic error: %s", e)
finally:
stream.stop_stream()
stream.close()
pa.terminate()
async def _receive_audio(self, ws):
"""Receive audio responses and transcriptions from Gemini."""
async for msg in ws:
if not self._running:
break
try:
response = json.loads(msg)
server_content = response.get("serverContent", {})
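                # Gemini acknowledges the barge-in; clear our local flag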
if server_content.get("interrupted"):
self.interrupted = False
# User transcription (partial/streaming)
input_tr = (
server_content.get("inputTranscription")
or server_content.get("input_transcription")
or server_content.get("inputAudioTranscription")
or server_content.get("input_audio_transcription")
)
if isinstance(input_tr, dict):
text = (input_tr.get("text") or "").strip()
now = time.time()
if text and now >= self._ignore_input_until and not self.speaking:
# Buffer ASR text
if now - self._asr_last_time > self.ASR_WINDOW_SEC:
self._asr_buf = ""
self._asr_buf = text # Gemini sends cumulative transcription
self._asr_last_time = now
if self.interrupted:
continue
# Audio from Gemini
model_turn = server_content.get("modelTurn")
if model_turn:
for part in model_turn.get("parts", []):
inline_data = part.get("inlineData")
if inline_data:
audio_b64 = inline_data.get("data")
if audio_b64:
now = time.time()
if not self.speaking:
self._ai_speaking_since = now
# Gemini started responding — fire transcript callback
if self._asr_buf and self._on_transcript:
self._on_transcript(self._asr_buf, "user")
self.speaking = True
self._last_ai_audio_time = now
self._ignore_input_until = now + self.ECHO_GUARD_SEC
audio_bytes = base64.b64decode(audio_b64)
await self._audio_queue.put(audio_bytes)
# Text from Gemini (thinking/response text)
text_part = part.get("text", "").strip()
if text_part and self._on_transcript:
self._on_transcript(text_part, "marcus")
# Turn complete — Gemini finished speaking
turn_complete = server_content.get("turnComplete")
if turn_complete:
# Clear ASR buffer after turn
self._asr_buf = ""
except Exception as e:
log.error("Receive error: %s", e)
async def _play_audio(self):
"""Collect Gemini audio chunks and play on G1 speaker."""
while self._running:
try:
if not self.speaking:
await asyncio.sleep(0.05)
continue
# Pre-buffer
buffered = False
while self.speaking and not buffered:
if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
buffered = True
else:
await asyncio.sleep(0.01)
# Collect all audio chunks
buffer_chunks = []
while self.speaking:
try:
data = await asyncio.wait_for(
self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
audio = np.frombuffer(data, dtype=np.int16)
buffer_chunks.append(audio)
self._last_ai_audio_time = time.time()
except asyncio.TimeoutError:
if self._audio_queue.empty():
if time.time() - self._last_ai_audio_time > 0.3:
break
# Play on G1 speaker
if buffer_chunks:
full_audio = np.concatenate(buffer_chunks)
duration = len(full_audio) / RECEIVE_RATE
log.info("Playing %.1fs on G1", duration)
await asyncio.get_event_loop().run_in_executor(
None, self._play_buffer_on_g1, full_audio)
self.speaking = False
except Exception as e:
log.error("Play error: %s", e)
self.speaking = False
# ─── MAIN LOOP ────────────────────────────────────────
async def _run_async(self):
import websockets
import inspect
system_prompt = load_system_prompt()
# Unmute mic
subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
# Calibrate
self._calibrate_mic()
ws_kwargs = {"max_size": None}
try:
sig = inspect.signature(websockets.connect)
if "extra_headers" in sig.parameters:
ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
else:
ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
except Exception:
ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
while self._running:
try:
log.info("Connecting to Gemini...")
async with websockets.connect(URI, **ws_kwargs) as ws:
setup_msg = {
"setup": {
"model": MODEL,
"generationConfig": {
"responseModalities": ["AUDIO"],
"thinkingConfig": {"thinkingBudget": 0},
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
}
},
},
"realtimeInputConfig": {
"automaticActivityDetection": {
"startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
"prefixPaddingMs": 40,
"endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
"silenceDurationMs": 250,
}
},
"inputAudioTranscription": {},
"systemInstruction": {"parts": [{"text": system_prompt}]},
}
}
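                    # thinkingBudget 0 turns off model "thinking" to cut
                    # first-audio latency; the automaticActivityDetection block
                    # tunes Gemini's server-side VAD (high start/end sensitivity,
                    # 250 ms of silence closes the user's turn).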
await ws.send(json.dumps(setup_msg))
await ws.recv()
log.info("Connected! Always listening...")
self._audio_queue = asyncio.Queue()
await asyncio.gather(
self._capture_mic(ws),
self._receive_audio(ws),
self._play_audio(),
)
except Exception as e:
if self._running:
log.error("Connection error: %s — reconnecting in 3s", e)
await asyncio.sleep(3)
def _voice_thread(self):
asyncio.run(self._run_async())
# ─── START / STOP ─────────────────────────────────────
def start(self):
if self._running:
return
self._running = True
self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
self._thread.start()
log.info("Gemini voice module started")
def stop(self):
self._running = False
if self._thread:
self._thread.join(timeout=5)
self._thread = None
log.info("Gemini voice module stopped")
@property
def is_running(self) -> bool:
return self._running
@property
def state(self) -> str:
return "LISTENING" if self._running else "STOPPED"
@property
def is_speaking(self) -> bool:
return self.speaking
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
import sys
sys.path.insert(0, PROJECT_ROOT)
from API.audio_api import AudioAPI
def on_transcript(text, role):
print(f" [{role.upper()}] {text}")
audio = AudioAPI()
voice = GeminiVoiceModule(audio, on_transcript=on_transcript)
print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
voice.start()
try:
while voice.is_running:
time.sleep(0.5)
except KeyboardInterrupt:
print("\nStopping...")
voice.stop()

View File

@ -1,19 +1,20 @@
 #!/usr/bin/env python3
 """
-Features/Voice/marcus_voice.py Marcus Always-Listening Voice Module
-======================================================================
+Voice/marcus_voice.py Marcus Always-Listening Voice Module (English)
+=======================================================================
 
 State machine:
   IDLE → (wake word detected) → WAKE_HEARD
   WAKE_HEARD → (record command) → PROCESSING
   PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
   SPEAKING → (TTS done) → IDLE
 
-Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
+Wake word: "Marcus" (detected by Whisper tiny)
 Commands: Transcribed by Whisper small
-TTS: Handled by API/audio_api.py
+Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
+TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
 
 Usage:
-  from Features.Voice.marcus_voice import VoiceModule
+  from Voice.marcus_voice import VoiceModule
   voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
   voice.start()  # background thread
   voice.stop()
@ -21,7 +22,6 @@ Usage:
 import logging
 import os
-import subprocess
 import threading
 import time
 import numpy as np
@ -74,7 +74,8 @@ class VoiceModule:
         """
         Args:
             audio_api: AudioAPI instance (from API/audio_api.py)
-            on_command: callback(text: str, lang: str) called when command is transcribed
+            on_command: callback(text: str, lang: str) "lang" is always "en"
+                        now; kept in the signature for interface stability.
         """
         self._audio = audio_api
         self._on_command = on_command
@ -83,13 +84,23 @@
         self._stt = self._config["stt"]
         self._mic = self._config["mic"]
 
-        # Whisper models — lazy loaded
+        # Whisper models — lazy loaded on first _voice_loop() iteration
         self._wake_model = None
         self._cmd_model = None
 
-        # Wake words
-        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
-        self._wake_ar = self._stt["wake_words_ar"]
+        # Wake words (English only — built-in TTS doesn't do Arabic)
+        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
+                                                          ["marcus", "marcos"])]
+
+        # G1 built-in mic (UDP multicast).
+        from Voice.builtin_mic import BuiltinMic
+        _mcfg = self._config.get("mic_udp", {})
+        self._mic_capture = BuiltinMic(
+            group=_mcfg.get("group", "239.168.123.161"),
+            port=_mcfg.get("port", 5555),
+            buf_max=_mcfg.get("buffer_max_bytes", 64000),
+        )
+        self._sample_rate = self._mic_capture.sample_rate  # 16000
 
         # State
         self._state = State.IDLE
@ -97,7 +108,7 @@
         self._thread = None
         self._lock = threading.Lock()
 
-        log.info("VoiceModule initialized")
+        log.info("VoiceModule initialized (mic: G1 built-in UDP)")
 
     # ─── MODEL LOADING ────────────────────────────────────
@ -115,69 +126,49 @@
         self._cmd_model = whisper.load_model(self._stt["command_model"])
         log.info("Command model ready")
 
-    # ─── MIC RECORDING ────────────────────────────────────
+    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
 
     def _record_chunk(self, seconds: float) -> np.ndarray:
-        """Record audio chunk from mic via parec."""
-        source = self._mic["source_index"]
-        rate = str(self._mic["rate"])
-
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-        time.sleep(seconds)
-        proc.terminate()
-        raw = proc.stdout.read()
-        return np.frombuffer(raw, dtype=np.int16)
+        """Capture a fixed-duration chunk from the G1 built-in mic."""
+        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
+        raw = bytearray()
+        bite = 1024
+        while len(raw) < num_bytes:
+            raw.extend(self._mic_capture.read_chunk(min(bite, num_bytes - len(raw))))
+        return np.frombuffer(bytes(raw), dtype=np.int16)
 
     def _record_until_silence(self) -> np.ndarray:
-        """Record until silence is detected or max duration reached."""
-        source = self._mic["source_index"]
-        rate = self._mic["rate"]
-        threshold = self._stt["silence_threshold"]
-        silence_dur = self._stt["silence_duration_sec"]
-        max_dur = self._stt["max_record_sec"]
+        """Capture until RMS drops below threshold for `silence_duration_sec`."""
+        threshold = self._stt.get("silence_threshold", 500)
+        silence_dur = self._stt.get("silence_duration_sec", 1.5)
+        max_dur = self._stt.get("max_record_sec", 15)
 
         chunk_sec = 0.5
-        chunk_samples = int(rate * chunk_sec)
-        silence_chunks_needed = int(silence_dur / chunk_sec)
+        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
+        silence_chunks_need = int(silence_dur / chunk_sec)
         max_chunks = int(max_dur / chunk_sec)
 
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-
         all_audio = []
         silence_count = 0
         chunk_count = 0
-        try:
-            while chunk_count < max_chunks:
-                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
-                if not data:
-                    break
-
-                chunk = np.frombuffer(data, dtype=np.int16)
-                all_audio.append(chunk)
-                chunk_count += 1
-
-                # Check for silence
-                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
-                if rms < threshold:
-                    silence_count += 1
-                else:
-                    silence_count = 0
-
-                if silence_count >= silence_chunks_needed and chunk_count > 2:
-                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
-                    break
-        finally:
-            proc.terminate()
-            proc.stdout.read()  # drain
+        while chunk_count < max_chunks:
+            raw = self._mic_capture.read_chunk(chunk_bytes)
+            if not raw:
+                break
+            chunk = np.frombuffer(raw, dtype=np.int16)
+            all_audio.append(chunk)
+            chunk_count += 1
+
+            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
+            if rms < threshold:
+                silence_count += 1
+            else:
+                silence_count = 0
+
+            if silence_count >= silence_chunks_need and chunk_count > 2:
+                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
+                break
 
         if all_audio:
             return np.concatenate(all_audio)
@ -205,38 +196,18 @@
         return text
 
     def _check_wake_word(self, text: str) -> bool:
-        """Check if transcribed text contains a wake word."""
+        """Check if transcribed text contains an English wake word."""
         text_lower = text.lower().strip()
-
-        # English wake words
-        for w in self._wake_en:
-            if w in text_lower:
-                return True
-
-        # Arabic wake words
-        for w in self._wake_ar:
-            if w in text:
-                return True
-
-        return False
+        return any(w in text_lower for w in self._wake_en)
 
     # ─── MAIN LOOP ────────────────────────────────────────
 
     def _voice_loop(self):
         """Main voice processing loop — runs in background thread."""
         self._load_whisper()
+        self._mic_capture.start()
         log.info("Voice loop started — listening for wake word...")
 
-        # Unmute mic once
-        subprocess.run(
-            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
-            capture_output=True,
-        )
-        subprocess.run(
-            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
-            capture_output=True,
-        )
-
         while self._running:
             try:
                 if self._state == State.IDLE:
@ -279,9 +250,7 @@
             self._state = State.WAKE_HEARD
 
             # Acknowledge
-            self._audio.speak(
-                self._config["messages"]["wake_heard"], "en"
-            )
+            self._audio.speak(self._config["messages"]["wake_heard"])
 
     def _do_wake_heard(self):
         """Record the command until silence."""
@ -294,7 +263,7 @@
         if len(audio) < 4000:  # < 0.25s at 16kHz
             log.info("Too short, ignoring")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
@ -308,18 +277,16 @@
         if not text or len(text.strip()) < 2:
             log.info("Empty transcription")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
 
-        # Detect language
-        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
-        log.info("Command [%s]: %s", lang, text)
+        log.info("Command: %s", text)
 
-        # Send to brain callback
+        # Send to brain callback (lang always "en" in this build)
         if self._on_command:
             try:
-                self._on_command(text, lang)
+                self._on_command(text, "en")
             except Exception as e:
                 log.error("Brain callback error: %s", e)
@ -342,6 +309,10 @@
     def stop(self):
         """Stop voice listening."""
         self._running = False
+        try:
+            self._mic_capture.stop()
+        except Exception:
+            pass
         if self._thread:
             self._thread.join(timeout=5)
             self._thread = None
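
Note on Voice/builtin_mic.py: the file itself is not shown in this commit view.
Inferring only from the call sites above (constructor kwargs group/port/buf_max,
a sample_rate attribute, and start()/read_chunk()/stop()), a minimal sketch of
the interface could look like the following; class and method names match the
callers, everything else is an assumption rather than the shipped code.

import socket
import struct
import threading


class BuiltinMic:
    """Sketch: G1 built-in mic, 16 kHz mono s16le over UDP multicast."""

    def __init__(self, group="239.168.123.161", port=5555, buf_max=64000):
        self.sample_rate = 16000            # fixed by the robot firmware
        self._group, self._port, self._buf_max = group, port, buf_max
        self._buf = bytearray()
        self._cond = threading.Condition()  # guards _buf, wakes readers
        self._running = False
        self._sock = None
        self._thread = None

    def start(self):
        """Join the multicast group and buffer packets on a daemon thread."""
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", self._port))
        mreq = struct.pack("4sl", socket.inet_aton(self._group), socket.INADDR_ANY)
        self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
        self._running = True
        self._thread = threading.Thread(target=self._rx_loop, daemon=True)
        self._thread.start()

    def _rx_loop(self):
        while self._running:
            try:
                data, _ = self._sock.recvfrom(4096)
            except OSError:        # socket closed by stop()
                break
            with self._cond:
                self._buf.extend(data)
                overflow = len(self._buf) - self._buf_max
                if overflow > 0:   # bound latency: drop the oldest audio
                    del self._buf[:overflow]
                self._cond.notify_all()

    def read_chunk(self, num_bytes: int) -> bytes:
        """Block until num_bytes of PCM are buffered; b'' after stop()."""
        with self._cond:
            while self._running and len(self._buf) < num_bytes:
                self._cond.wait(timeout=0.5)
            take = min(num_bytes, len(self._buf))
            out = bytes(self._buf[:take])
            del self._buf[:take]
            return out

    def stop(self):
        self._running = False
        if self._sock:
            self._sock.close()

The drop-oldest buffer matches the buffer_max_bytes default: 64000 bytes is two
seconds at 16 kHz mono s16le, so a stalled reader resumes on recent audio
instead of a stale backlog.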